aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAkira Fujita <a-fujita@rs.jp.nec.com>2009-11-23 07:24:43 -0500
committerTheodore Ts'o <tytso@mit.edu>2009-11-23 07:24:43 -0500
commitfc04cb49a898c372a22b21fffc47f299d8710801 (patch)
tree668bc49ab0a89c6260d78b31eb09b7c60a27bdac
parentf868a48d06f8886cb0367568a12367fa4f21ea0d (diff)
ext4: fix lock order problem in ext4_move_extents()
ext4_move_extents() checks the logical block contiguousness of original file with ext4_find_extent() and mext_next_extent(). Therefore the extent which ext4_ext_path structure indicates must not be changed between above functions. But in current implementation, there is no i_data_sem protection between ext4_ext_find_extent() and mext_next_extent(). So the extent which ext4_ext_path structure indicates may be overwritten by delalloc. As a result, ext4_move_extents() will exchange wrong blocks between original and donor files. I change the place where acquire/release i_data_sem to solve this problem. Moreover, I changed move_extent_per_page() to start transaction first, and then acquire i_data_sem. Without this change, there is a possibility of the deadlock between mmap() and ext4_move_extents(): * NOTE: "A", "B" and "C" mean different processes A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes. B: do_page_fault() starts the transaction (T), and then tries to acquire i_data_sem. But process "A" is already holding it, so it is kept waiting. C: While "A" and "B" running, kjournald2 tries to commit transaction (T) but it is under updating, so kjournald2 waits for it. A-2: Call ext4_journal_start with holding i_data_sem, but transaction (T) is locked. Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
-rw-r--r--fs/ext4/move_extent.c117
1 files changed, 53 insertions, 64 deletions
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 83f8c9e47c60..a7410b34c5ed 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
77mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 77mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
78 struct ext4_extent **extent) 78 struct ext4_extent **extent)
79{ 79{
80 struct ext4_extent_header *eh;
80 int ppos, leaf_ppos = path->p_depth; 81 int ppos, leaf_ppos = path->p_depth;
81 82
82 ppos = leaf_ppos; 83 ppos = leaf_ppos;
83 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 84 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
84 /* leaf block */ 85 /* leaf block */
85 *extent = ++path[ppos].p_ext; 86 *extent = ++path[ppos].p_ext;
87 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
86 return 0; 88 return 0;
87 } 89 }
88 90
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
119 ext_block_hdr(path[cur_ppos+1].p_bh); 121 ext_block_hdr(path[cur_ppos+1].p_bh);
120 } 122 }
121 123
124 path[leaf_ppos].p_ext = *extent = NULL;
125
126 eh = path[leaf_ppos].p_hdr;
127 if (le16_to_cpu(eh->eh_entries) == 0)
128 /* empty leaf is found */
129 return -ENODATA;
130
122 /* leaf block */ 131 /* leaf block */
123 path[leaf_ppos].p_ext = *extent = 132 path[leaf_ppos].p_ext = *extent =
124 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 133 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
134 path[leaf_ppos].p_block =
135 ext_pblock(path[leaf_ppos].p_ext);
125 return 0; 136 return 0;
126 } 137 }
127 } 138 }
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
155} 166}
156 167
157/** 168/**
158 * mext_double_down_read - Acquire two inodes' read semaphore 169 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
159 *
160 * @orig_inode: original inode structure
161 * @donor_inode: donor inode structure
162 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
163 */
164static void
165mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
166{
167 struct inode *first = orig_inode, *second = donor_inode;
168
169 /*
170 * Use the inode number to provide the stable locking order instead
171 * of its address, because the C language doesn't guarantee you can
172 * compare pointers that don't come from the same array.
173 */
174 if (donor_inode->i_ino < orig_inode->i_ino) {
175 first = donor_inode;
176 second = orig_inode;
177 }
178
179 down_read(&EXT4_I(first)->i_data_sem);
180 down_read(&EXT4_I(second)->i_data_sem);
181}
182
183/**
184 * mext_double_down_write - Acquire two inodes' write semaphore
185 * 170 *
186 * @orig_inode: original inode structure 171 * @orig_inode: original inode structure
187 * @donor_inode: donor inode structure 172 * @donor_inode: donor inode structure
188 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. 173 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
174 * i_ino order.
189 */ 175 */
190static void 176static void
191mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) 177double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
192{ 178{
193 struct inode *first = orig_inode, *second = donor_inode; 179 struct inode *first = orig_inode, *second = donor_inode;
194 180
@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
207} 193}
208 194
209/** 195/**
210 * mext_double_up_read - Release two inodes' read semaphore 196 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
211 * 197 *
212 * @orig_inode: original inode structure to be released its lock first 198 * @orig_inode: original inode structure to be released its lock first
213 * @donor_inode: donor inode structure to be released its lock second 199 * @donor_inode: donor inode structure to be released its lock second
214 * Release read semaphore of two inodes (orig and donor). 200 * Release write lock of i_data_sem of two inodes (orig and donor).
215 */ 201 */
216static void 202static void
217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 203double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
218{
219 up_read(&EXT4_I(orig_inode)->i_data_sem);
220 up_read(&EXT4_I(donor_inode)->i_data_sem);
221}
222
223/**
224 * mext_double_up_write - Release two inodes' write semaphore
225 *
226 * @orig_inode: original inode structure to be released its lock first
227 * @donor_inode: donor inode structure to be released its lock second
228 * Release write semaphore of two inodes (orig and donor).
229 */
230static void
231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
232{ 204{
233 up_write(&EXT4_I(orig_inode)->i_data_sem); 205 up_write(&EXT4_I(orig_inode)->i_data_sem);
234 up_write(&EXT4_I(donor_inode)->i_data_sem); 206 up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
688 int replaced_count = 0; 660 int replaced_count = 0;
689 int dext_alen; 661 int dext_alen;
690 662
691 mext_double_down_write(orig_inode, donor_inode);
692
693 /* Get the original extent for the block "orig_off" */ 663 /* Get the original extent for the block "orig_off" */
694 *err = get_ext_path(orig_inode, orig_off, &orig_path); 664 *err = get_ext_path(orig_inode, orig_off, &orig_path);
695 if (*err) 665 if (*err)
@@ -785,7 +755,6 @@ out:
785 kfree(donor_path); 755 kfree(donor_path);
786 } 756 }
787 757
788 mext_double_up_write(orig_inode, donor_inode);
789 return replaced_count; 758 return replaced_count;
790} 759}
791 760
@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
851 * Just swap data blocks between orig and donor. 820 * Just swap data blocks between orig and donor.
852 */ 821 */
853 if (uninit) { 822 if (uninit) {
823 /*
824 * Protect extent trees against block allocations
825 * via delalloc
826 */
827 double_down_write_data_sem(orig_inode, donor_inode);
854 replaced_count = mext_replace_branches(handle, orig_inode, 828 replaced_count = mext_replace_branches(handle, orig_inode,
855 donor_inode, orig_blk_offset, 829 donor_inode, orig_blk_offset,
856 block_len_in_page, err); 830 block_len_in_page, err);
@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 /* Clear the inode cache not to refer to the old data */ 832 /* Clear the inode cache not to refer to the old data */
859 ext4_ext_invalidate_cache(orig_inode); 833 ext4_ext_invalidate_cache(orig_inode);
860 ext4_ext_invalidate_cache(donor_inode); 834 ext4_ext_invalidate_cache(donor_inode);
835 double_up_write_data_sem(orig_inode, donor_inode);
861 goto out2; 836 goto out2;
862 } 837 }
863 838
@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
905 /* Release old bh and drop refs */ 880 /* Release old bh and drop refs */
906 try_to_release_page(page, 0); 881 try_to_release_page(page, 0);
907 882
883 /* Protect extent trees against block allocations via delalloc */
884 double_down_write_data_sem(orig_inode, donor_inode);
908 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 885 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
909 orig_blk_offset, block_len_in_page, 886 orig_blk_offset, block_len_in_page,
910 &err2); 887 &err2);
@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
913 block_len_in_page = replaced_count; 890 block_len_in_page = replaced_count;
914 replaced_size = 891 replaced_size =
915 block_len_in_page << orig_inode->i_blkbits; 892 block_len_in_page << orig_inode->i_blkbits;
916 } else 893 } else {
894 double_up_write_data_sem(orig_inode, donor_inode);
917 goto out; 895 goto out;
896 }
918 } 897 }
919 898
920 /* Clear the inode cache not to refer to the old data */ 899 /* Clear the inode cache not to refer to the old data */
921 ext4_ext_invalidate_cache(orig_inode); 900 ext4_ext_invalidate_cache(orig_inode);
922 ext4_ext_invalidate_cache(donor_inode); 901 ext4_ext_invalidate_cache(donor_inode);
923 902
903 double_up_write_data_sem(orig_inode, donor_inode);
904
924 if (!page_has_buffers(page)) 905 if (!page_has_buffers(page))
925 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 906 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
926 907
@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1236 return -EINVAL; 1217 return -EINVAL;
1237 } 1218 }
1238 1219
1239 /* protect orig and donor against a truncate */ 1220 /* Protect orig and donor inodes against a truncate */
1240 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1221 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1241 if (ret1 < 0) 1222 if (ret1 < 0)
1242 return ret1; 1223 return ret1;
1243 1224
1244 mext_double_down_read(orig_inode, donor_inode); 1225 /* Protect extent tree against block allocations via delalloc */
1226 double_down_write_data_sem(orig_inode, donor_inode);
1245 /* Check the filesystem environment whether move_extent can be done */ 1227 /* Check the filesystem environment whether move_extent can be done */
1246 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1228 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1247 donor_start, &len, *moved_len); 1229 donor_start, &len, *moved_len);
1248 mext_double_up_read(orig_inode, donor_inode);
1249 if (ret1) 1230 if (ret1)
1250 goto out; 1231 goto out;
1251 1232
@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1308 ext4_ext_get_actual_len(ext_cur), block_end + 1) - 1289 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
1309 max(le32_to_cpu(ext_cur->ee_block), block_start); 1290 max(le32_to_cpu(ext_cur->ee_block), block_start);
1310 1291
1292 /* Discard preallocations of two inodes */
1293 ext4_discard_preallocations(orig_inode);
1294 ext4_discard_preallocations(donor_inode);
1295
1311 while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { 1296 while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
1312 seq_blocks += add_blocks; 1297 seq_blocks += add_blocks;
1313 1298
@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1359 seq_start = le32_to_cpu(ext_cur->ee_block); 1344 seq_start = le32_to_cpu(ext_cur->ee_block);
1360 rest_blocks = seq_blocks; 1345 rest_blocks = seq_blocks;
1361 1346
1362 /* Discard preallocations of two inodes */ 1347 /*
1363 down_write(&EXT4_I(orig_inode)->i_data_sem); 1348 * Up semaphore to avoid following problems:
1364 ext4_discard_preallocations(orig_inode); 1349 * a. transaction deadlock among ext4_journal_start,
1365 up_write(&EXT4_I(orig_inode)->i_data_sem); 1350 * ->write_begin via pagefault, and jbd2_journal_commit
1366 1351 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1367 down_write(&EXT4_I(donor_inode)->i_data_sem); 1352 * in move_extent_per_page
1368 ext4_discard_preallocations(donor_inode); 1353 */
1369 up_write(&EXT4_I(donor_inode)->i_data_sem); 1354 double_up_write_data_sem(orig_inode, donor_inode);
1370 1355
1371 while (orig_page_offset <= seq_end_page) { 1356 while (orig_page_offset <= seq_end_page) {
1372 1357
@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1381 /* Count how many blocks we have exchanged */ 1366 /* Count how many blocks we have exchanged */
1382 *moved_len += block_len_in_page; 1367 *moved_len += block_len_in_page;
1383 if (ret1 < 0) 1368 if (ret1 < 0)
1384 goto out; 1369 break;
1385 if (*moved_len > len) { 1370 if (*moved_len > len) {
1386 ext4_error(orig_inode->i_sb, __func__, 1371 ext4_error(orig_inode->i_sb, __func__,
1387 "We replaced blocks too much! " 1372 "We replaced blocks too much! "
1388 "sum of replaced: %llu requested: %llu", 1373 "sum of replaced: %llu requested: %llu",
1389 *moved_len, len); 1374 *moved_len, len);
1390 ret1 = -EIO; 1375 ret1 = -EIO;
1391 goto out; 1376 break;
1392 } 1377 }
1393 1378
1394 orig_page_offset++; 1379 orig_page_offset++;
@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1400 block_len_in_page = rest_blocks; 1385 block_len_in_page = rest_blocks;
1401 } 1386 }
1402 1387
1388 double_down_write_data_sem(orig_inode, donor_inode);
1389 if (ret1 < 0)
1390 break;
1391
1403 /* Decrease buffer counter */ 1392 /* Decrease buffer counter */
1404 if (holecheck_path) 1393 if (holecheck_path)
1405 ext4_ext_drop_refs(holecheck_path); 1394 ext4_ext_drop_refs(holecheck_path);
@@ -1429,7 +1418,7 @@ out:
1429 ext4_ext_drop_refs(holecheck_path); 1418 ext4_ext_drop_refs(holecheck_path);
1430 kfree(holecheck_path); 1419 kfree(holecheck_path);
1431 } 1420 }
1432 1421 double_up_write_data_sem(orig_inode, donor_inode);
1433 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1422 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1434 1423
1435 if (ret1) 1424 if (ret1)