diff options
author | Akira Fujita <a-fujita@rs.jp.nec.com> | 2009-11-23 07:24:43 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2009-11-23 07:24:43 -0500 |
commit | fc04cb49a898c372a22b21fffc47f299d8710801 (patch) | |
tree | 668bc49ab0a89c6260d78b31eb09b7c60a27bdac /fs/ext4 | |
parent | f868a48d06f8886cb0367568a12367fa4f21ea0d (diff) |
ext4: fix lock order problem in ext4_move_extents()
ext4_move_extents() checks the logical block contiguousness
of original file with ext4_find_extent() and mext_next_extent().
Therefore the extent which ext4_ext_path structure indicates
must not be changed between above functions.
But in current implementation, there is no i_data_sem protection
between ext4_ext_find_extent() and mext_next_extent(). So the extent
which ext4_ext_path structure indicates may be overwritten by
delalloc. As a result, ext4_move_extents() will exchange wrong blocks
between original and donor files. I change the place where
acquire/release i_data_sem to solve this problem.
Moreover, I changed move_extent_per_page() to start transaction first,
and then acquire i_data_sem. Without this change, there is a
possibility of the deadlock between mmap() and ext4_move_extents():
* NOTE: "A", "B" and "C" mean different processes
A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes.
B: do_page_fault() starts the transaction (T),
and then tries to acquire i_data_sem.
But process "A" is already holding it, so it is kept waiting.
C: While "A" and "B" running, kjournald2 tries to commit transaction (T)
but it is under updating, so kjournald2 waits for it.
A-2: Call ext4_journal_start with holding i_data_sem,
but transaction (T) is locked.
Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/move_extent.c | 117 |
1 files changed, 53 insertions, 64 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 83f8c9e47c60..a7410b34c5ed 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -77,12 +77,14 @@ static int | |||
77 | mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | 77 | mext_next_extent(struct inode *inode, struct ext4_ext_path *path, |
78 | struct ext4_extent **extent) | 78 | struct ext4_extent **extent) |
79 | { | 79 | { |
80 | struct ext4_extent_header *eh; | ||
80 | int ppos, leaf_ppos = path->p_depth; | 81 | int ppos, leaf_ppos = path->p_depth; |
81 | 82 | ||
82 | ppos = leaf_ppos; | 83 | ppos = leaf_ppos; |
83 | if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { | 84 | if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { |
84 | /* leaf block */ | 85 | /* leaf block */ |
85 | *extent = ++path[ppos].p_ext; | 86 | *extent = ++path[ppos].p_ext; |
87 | path[ppos].p_block = ext_pblock(path[ppos].p_ext); | ||
86 | return 0; | 88 | return 0; |
87 | } | 89 | } |
88 | 90 | ||
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | |||
119 | ext_block_hdr(path[cur_ppos+1].p_bh); | 121 | ext_block_hdr(path[cur_ppos+1].p_bh); |
120 | } | 122 | } |
121 | 123 | ||
124 | path[leaf_ppos].p_ext = *extent = NULL; | ||
125 | |||
126 | eh = path[leaf_ppos].p_hdr; | ||
127 | if (le16_to_cpu(eh->eh_entries) == 0) | ||
128 | /* empty leaf is found */ | ||
129 | return -ENODATA; | ||
130 | |||
122 | /* leaf block */ | 131 | /* leaf block */ |
123 | path[leaf_ppos].p_ext = *extent = | 132 | path[leaf_ppos].p_ext = *extent = |
124 | EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); | 133 | EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); |
134 | path[leaf_ppos].p_block = | ||
135 | ext_pblock(path[leaf_ppos].p_ext); | ||
125 | return 0; | 136 | return 0; |
126 | } | 137 | } |
127 | } | 138 | } |
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2, | |||
155 | } | 166 | } |
156 | 167 | ||
157 | /** | 168 | /** |
158 | * mext_double_down_read - Acquire two inodes' read semaphore | 169 | * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem |
159 | * | ||
160 | * @orig_inode: original inode structure | ||
161 | * @donor_inode: donor inode structure | ||
162 | * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. | ||
163 | */ | ||
164 | static void | ||
165 | mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) | ||
166 | { | ||
167 | struct inode *first = orig_inode, *second = donor_inode; | ||
168 | |||
169 | /* | ||
170 | * Use the inode number to provide the stable locking order instead | ||
171 | * of its address, because the C language doesn't guarantee you can | ||
172 | * compare pointers that don't come from the same array. | ||
173 | */ | ||
174 | if (donor_inode->i_ino < orig_inode->i_ino) { | ||
175 | first = donor_inode; | ||
176 | second = orig_inode; | ||
177 | } | ||
178 | |||
179 | down_read(&EXT4_I(first)->i_data_sem); | ||
180 | down_read(&EXT4_I(second)->i_data_sem); | ||
181 | } | ||
182 | |||
183 | /** | ||
184 | * mext_double_down_write - Acquire two inodes' write semaphore | ||
185 | * | 170 | * |
186 | * @orig_inode: original inode structure | 171 | * @orig_inode: original inode structure |
187 | * @donor_inode: donor inode structure | 172 | * @donor_inode: donor inode structure |
188 | * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. | 173 | * Acquire write lock of i_data_sem of the two inodes (orig and donor) by |
174 | * i_ino order. | ||
189 | */ | 175 | */ |
190 | static void | 176 | static void |
191 | mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | 177 | double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) |
192 | { | 178 | { |
193 | struct inode *first = orig_inode, *second = donor_inode; | 179 | struct inode *first = orig_inode, *second = donor_inode; |
194 | 180 | ||
@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | |||
207 | } | 193 | } |
208 | 194 | ||
209 | /** | 195 | /** |
210 | * mext_double_up_read - Release two inodes' read semaphore | 196 | * double_up_write_data_sem - Release two inodes' write lock of i_data_sem |
211 | * | 197 | * |
212 | * @orig_inode: original inode structure to be released its lock first | 198 | * @orig_inode: original inode structure to be released its lock first |
213 | * @donor_inode: donor inode structure to be released its lock second | 199 | * @donor_inode: donor inode structure to be released its lock second |
214 | * Release read semaphore of two inodes (orig and donor). | 200 | * Release write lock of i_data_sem of two inodes (orig and donor). |
215 | */ | 201 | */ |
216 | static void | 202 | static void |
217 | mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) | 203 | double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) |
218 | { | ||
219 | up_read(&EXT4_I(orig_inode)->i_data_sem); | ||
220 | up_read(&EXT4_I(donor_inode)->i_data_sem); | ||
221 | } | ||
222 | |||
223 | /** | ||
224 | * mext_double_up_write - Release two inodes' write semaphore | ||
225 | * | ||
226 | * @orig_inode: original inode structure to be released its lock first | ||
227 | * @donor_inode: donor inode structure to be released its lock second | ||
228 | * Release write semaphore of two inodes (orig and donor). | ||
229 | */ | ||
230 | static void | ||
231 | mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) | ||
232 | { | 204 | { |
233 | up_write(&EXT4_I(orig_inode)->i_data_sem); | 205 | up_write(&EXT4_I(orig_inode)->i_data_sem); |
234 | up_write(&EXT4_I(donor_inode)->i_data_sem); | 206 | up_write(&EXT4_I(donor_inode)->i_data_sem); |
@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
688 | int replaced_count = 0; | 660 | int replaced_count = 0; |
689 | int dext_alen; | 661 | int dext_alen; |
690 | 662 | ||
691 | mext_double_down_write(orig_inode, donor_inode); | ||
692 | |||
693 | /* Get the original extent for the block "orig_off" */ | 663 | /* Get the original extent for the block "orig_off" */ |
694 | *err = get_ext_path(orig_inode, orig_off, &orig_path); | 664 | *err = get_ext_path(orig_inode, orig_off, &orig_path); |
695 | if (*err) | 665 | if (*err) |
@@ -785,7 +755,6 @@ out: | |||
785 | kfree(donor_path); | 755 | kfree(donor_path); |
786 | } | 756 | } |
787 | 757 | ||
788 | mext_double_up_write(orig_inode, donor_inode); | ||
789 | return replaced_count; | 758 | return replaced_count; |
790 | } | 759 | } |
791 | 760 | ||
@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
851 | * Just swap data blocks between orig and donor. | 820 | * Just swap data blocks between orig and donor. |
852 | */ | 821 | */ |
853 | if (uninit) { | 822 | if (uninit) { |
823 | /* | ||
824 | * Protect extent trees against block allocations | ||
825 | * via delalloc | ||
826 | */ | ||
827 | double_down_write_data_sem(orig_inode, donor_inode); | ||
854 | replaced_count = mext_replace_branches(handle, orig_inode, | 828 | replaced_count = mext_replace_branches(handle, orig_inode, |
855 | donor_inode, orig_blk_offset, | 829 | donor_inode, orig_blk_offset, |
856 | block_len_in_page, err); | 830 | block_len_in_page, err); |
@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
858 | /* Clear the inode cache not to refer to the old data */ | 832 | /* Clear the inode cache not to refer to the old data */ |
859 | ext4_ext_invalidate_cache(orig_inode); | 833 | ext4_ext_invalidate_cache(orig_inode); |
860 | ext4_ext_invalidate_cache(donor_inode); | 834 | ext4_ext_invalidate_cache(donor_inode); |
835 | double_up_write_data_sem(orig_inode, donor_inode); | ||
861 | goto out2; | 836 | goto out2; |
862 | } | 837 | } |
863 | 838 | ||
@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
905 | /* Release old bh and drop refs */ | 880 | /* Release old bh and drop refs */ |
906 | try_to_release_page(page, 0); | 881 | try_to_release_page(page, 0); |
907 | 882 | ||
883 | /* Protect extent trees against block allocations via delalloc */ | ||
884 | double_down_write_data_sem(orig_inode, donor_inode); | ||
908 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, | 885 | replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, |
909 | orig_blk_offset, block_len_in_page, | 886 | orig_blk_offset, block_len_in_page, |
910 | &err2); | 887 | &err2); |
@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, | |||
913 | block_len_in_page = replaced_count; | 890 | block_len_in_page = replaced_count; |
914 | replaced_size = | 891 | replaced_size = |
915 | block_len_in_page << orig_inode->i_blkbits; | 892 | block_len_in_page << orig_inode->i_blkbits; |
916 | } else | 893 | } else { |
894 | double_up_write_data_sem(orig_inode, donor_inode); | ||
917 | goto out; | 895 | goto out; |
896 | } | ||
918 | } | 897 | } |
919 | 898 | ||
920 | /* Clear the inode cache not to refer to the old data */ | 899 | /* Clear the inode cache not to refer to the old data */ |
921 | ext4_ext_invalidate_cache(orig_inode); | 900 | ext4_ext_invalidate_cache(orig_inode); |
922 | ext4_ext_invalidate_cache(donor_inode); | 901 | ext4_ext_invalidate_cache(donor_inode); |
923 | 902 | ||
903 | double_up_write_data_sem(orig_inode, donor_inode); | ||
904 | |||
924 | if (!page_has_buffers(page)) | 905 | if (!page_has_buffers(page)) |
925 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); | 906 | create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); |
926 | 907 | ||
@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1236 | return -EINVAL; | 1217 | return -EINVAL; |
1237 | } | 1218 | } |
1238 | 1219 | ||
1239 | /* protect orig and donor against a truncate */ | 1220 | /* Protect orig and donor inodes against a truncate */ |
1240 | ret1 = mext_inode_double_lock(orig_inode, donor_inode); | 1221 | ret1 = mext_inode_double_lock(orig_inode, donor_inode); |
1241 | if (ret1 < 0) | 1222 | if (ret1 < 0) |
1242 | return ret1; | 1223 | return ret1; |
1243 | 1224 | ||
1244 | mext_double_down_read(orig_inode, donor_inode); | 1225 | /* Protect extent tree against block allocations via delalloc */ |
1226 | double_down_write_data_sem(orig_inode, donor_inode); | ||
1245 | /* Check the filesystem environment whether move_extent can be done */ | 1227 | /* Check the filesystem environment whether move_extent can be done */ |
1246 | ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, | 1228 | ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, |
1247 | donor_start, &len, *moved_len); | 1229 | donor_start, &len, *moved_len); |
1248 | mext_double_up_read(orig_inode, donor_inode); | ||
1249 | if (ret1) | 1230 | if (ret1) |
1250 | goto out; | 1231 | goto out; |
1251 | 1232 | ||
@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1308 | ext4_ext_get_actual_len(ext_cur), block_end + 1) - | 1289 | ext4_ext_get_actual_len(ext_cur), block_end + 1) - |
1309 | max(le32_to_cpu(ext_cur->ee_block), block_start); | 1290 | max(le32_to_cpu(ext_cur->ee_block), block_start); |
1310 | 1291 | ||
1292 | /* Discard preallocations of two inodes */ | ||
1293 | ext4_discard_preallocations(orig_inode); | ||
1294 | ext4_discard_preallocations(donor_inode); | ||
1295 | |||
1311 | while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { | 1296 | while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { |
1312 | seq_blocks += add_blocks; | 1297 | seq_blocks += add_blocks; |
1313 | 1298 | ||
@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1359 | seq_start = le32_to_cpu(ext_cur->ee_block); | 1344 | seq_start = le32_to_cpu(ext_cur->ee_block); |
1360 | rest_blocks = seq_blocks; | 1345 | rest_blocks = seq_blocks; |
1361 | 1346 | ||
1362 | /* Discard preallocations of two inodes */ | 1347 | /* |
1363 | down_write(&EXT4_I(orig_inode)->i_data_sem); | 1348 | * Up semaphore to avoid following problems: |
1364 | ext4_discard_preallocations(orig_inode); | 1349 | * a. transaction deadlock among ext4_journal_start, |
1365 | up_write(&EXT4_I(orig_inode)->i_data_sem); | 1350 | * ->write_begin via pagefault, and jbd2_journal_commit |
1366 | 1351 | * b. racing with ->readpage, ->write_begin, and ext4_get_block | |
1367 | down_write(&EXT4_I(donor_inode)->i_data_sem); | 1352 | * in move_extent_per_page |
1368 | ext4_discard_preallocations(donor_inode); | 1353 | */ |
1369 | up_write(&EXT4_I(donor_inode)->i_data_sem); | 1354 | double_up_write_data_sem(orig_inode, donor_inode); |
1370 | 1355 | ||
1371 | while (orig_page_offset <= seq_end_page) { | 1356 | while (orig_page_offset <= seq_end_page) { |
1372 | 1357 | ||
@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1381 | /* Count how many blocks we have exchanged */ | 1366 | /* Count how many blocks we have exchanged */ |
1382 | *moved_len += block_len_in_page; | 1367 | *moved_len += block_len_in_page; |
1383 | if (ret1 < 0) | 1368 | if (ret1 < 0) |
1384 | goto out; | 1369 | break; |
1385 | if (*moved_len > len) { | 1370 | if (*moved_len > len) { |
1386 | ext4_error(orig_inode->i_sb, __func__, | 1371 | ext4_error(orig_inode->i_sb, __func__, |
1387 | "We replaced blocks too much! " | 1372 | "We replaced blocks too much! " |
1388 | "sum of replaced: %llu requested: %llu", | 1373 | "sum of replaced: %llu requested: %llu", |
1389 | *moved_len, len); | 1374 | *moved_len, len); |
1390 | ret1 = -EIO; | 1375 | ret1 = -EIO; |
1391 | goto out; | 1376 | break; |
1392 | } | 1377 | } |
1393 | 1378 | ||
1394 | orig_page_offset++; | 1379 | orig_page_offset++; |
@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1400 | block_len_in_page = rest_blocks; | 1385 | block_len_in_page = rest_blocks; |
1401 | } | 1386 | } |
1402 | 1387 | ||
1388 | double_down_write_data_sem(orig_inode, donor_inode); | ||
1389 | if (ret1 < 0) | ||
1390 | break; | ||
1391 | |||
1403 | /* Decrease buffer counter */ | 1392 | /* Decrease buffer counter */ |
1404 | if (holecheck_path) | 1393 | if (holecheck_path) |
1405 | ext4_ext_drop_refs(holecheck_path); | 1394 | ext4_ext_drop_refs(holecheck_path); |
@@ -1429,7 +1418,7 @@ out: | |||
1429 | ext4_ext_drop_refs(holecheck_path); | 1418 | ext4_ext_drop_refs(holecheck_path); |
1430 | kfree(holecheck_path); | 1419 | kfree(holecheck_path); |
1431 | } | 1420 | } |
1432 | 1421 | double_up_write_data_sem(orig_inode, donor_inode); | |
1433 | ret2 = mext_inode_double_unlock(orig_inode, donor_inode); | 1422 | ret2 = mext_inode_double_unlock(orig_inode, donor_inode); |
1434 | 1423 | ||
1435 | if (ret1) | 1424 | if (ret1) |