summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author: Qu Wenruo <quwenruo@cn.fujitsu.com> 2017-03-07 21:25:51 -0500
committer: Filipe Manana <fdmanana@suse.com> 2017-04-26 11:27:21 -0400
commit: 4dbd80fb9176f23c78cecd0a8285001cd2066425 (patch)
tree: 1c3c6d23ab7b74c714d93d1ebb8b99b82adb5fc1 /fs
parent: a967efb30b3afa3d858edd6a17f544f9e9e46eea (diff)
btrfs: Fix metadata underflow caused by btrfs_reloc_clone_csums error
[BUG]
When btrfs_reloc_clone_csums() reports an error, it can underflow metadata
and lead to a kernel assertion failure on outstanding extents in
run_delalloc_nocow() and cow_file_range().

 BTRFS info (device vdb5): relocating block group 12582912 flags data
 BTRFS info (device vdb5): found 1 extents
 assertion failed: inode->outstanding_extents >= num_extents, file: fs/btrfs//extent-tree.c, line: 5858

Currently, due to another bug blocking ordered extents, the bug is only
reproducible under a certain block group layout and using error injection:

a) Create one data block group with one 4K extent in it.
   (This avoids the other bug, which hangs btrfs due to an ordered extent
   that never finishes.)
b) Make btrfs_reloc_clone_csums() always fail.
c) Relocate that block group.

[CAUSE]
run_delalloc_nocow() and cow_file_range() handle errors from
btrfs_reloc_clone_csums() incorrectly:

(The ASCII chart shows a more generic case of this bug than the one
mentioned above.)

|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
                    |<----------- cleanup range --------------->|
|<-----------  ----------->|
             \/
 btrfs_finish_ordered_io() range

So the error handler, which calls extent_clear_unlock_delalloc() with the
EXTENT_DELALLOC and EXTENT_DO_ACCOUNT bits, and btrfs_finish_ordered_io()
will both cover OE n and free its metadata, causing a metadata underflow.

[FIX]
The fix is to ensure that, after calling btrfs_add_ordered_extent(), we only
call the error handler after increasing the iteration offset, so that the
cleanup range won't cover any created ordered extent.

|<------------------ delalloc range --------------------------->|
| OE 1 | OE 2 | ... | OE n |
|<-----------  ----------->|<---------- cleanup range --------->|
             \/
 btrfs_finish_ordered_io() range

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/inode.c 51
1 files changed, 39 insertions, 12 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 55ed2c4829a8..844bb896f5ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -998,15 +998,24 @@ static noinline int cow_file_range(struct inode *inode,
998 BTRFS_DATA_RELOC_TREE_OBJECTID) { 998 BTRFS_DATA_RELOC_TREE_OBJECTID) {
999 ret = btrfs_reloc_clone_csums(inode, start, 999 ret = btrfs_reloc_clone_csums(inode, start,
1000 cur_alloc_size); 1000 cur_alloc_size);
1001 /*
1002 * Only drop cache here, and process as normal.
1003 *
1004 * We must not allow extent_clear_unlock_delalloc()
1005 * at out_unlock label to free meta of this ordered
1006 * extent, as its meta should be freed by
1007 * btrfs_finish_ordered_io().
1008 *
1009 * So we must continue until @start is increased to
1010 * skip current ordered extent.
1011 */
1001 if (ret) 1012 if (ret)
1002 goto out_drop_extent_cache; 1013 btrfs_drop_extent_cache(BTRFS_I(inode), start,
1014 start + ram_size - 1, 0);
1003 } 1015 }
1004 1016
1005 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1017 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1006 1018
1007 if (disk_num_bytes < cur_alloc_size)
1008 break;
1009
1010 /* we're not doing compressed IO, don't unlock the first 1019 /* we're not doing compressed IO, don't unlock the first
1011 * page (which the caller expects to stay locked), don't 1020 * page (which the caller expects to stay locked), don't
1012 * clear any dirty bits and don't set any writeback bits 1021 * clear any dirty bits and don't set any writeback bits
@@ -1022,10 +1031,21 @@ static noinline int cow_file_range(struct inode *inode,
1022 delalloc_end, locked_page, 1031 delalloc_end, locked_page,
1023 EXTENT_LOCKED | EXTENT_DELALLOC, 1032 EXTENT_LOCKED | EXTENT_DELALLOC,
1024 op); 1033 op);
1025 disk_num_bytes -= cur_alloc_size; 1034 if (disk_num_bytes < cur_alloc_size)
1035 disk_num_bytes = 0;
1036 else
1037 disk_num_bytes -= cur_alloc_size;
1026 num_bytes -= cur_alloc_size; 1038 num_bytes -= cur_alloc_size;
1027 alloc_hint = ins.objectid + ins.offset; 1039 alloc_hint = ins.objectid + ins.offset;
1028 start += cur_alloc_size; 1040 start += cur_alloc_size;
1041
1042 /*
1043 * btrfs_reloc_clone_csums() error, since start is increased
1044 * extent_clear_unlock_delalloc() at out_unlock label won't
1045 * free metadata of current ordered extent, we're OK to exit.
1046 */
1047 if (ret)
1048 goto out_unlock;
1029 } 1049 }
1030out: 1050out:
1031 return ret; 1051 return ret;
@@ -1414,15 +1434,14 @@ out_check:
1414 BUG_ON(ret); /* -ENOMEM */ 1434 BUG_ON(ret); /* -ENOMEM */
1415 1435
1416 if (root->root_key.objectid == 1436 if (root->root_key.objectid ==
1417 BTRFS_DATA_RELOC_TREE_OBJECTID) { 1437 BTRFS_DATA_RELOC_TREE_OBJECTID)
1438 /*
1439 * Error handled later, as we must prevent
1440 * extent_clear_unlock_delalloc() in error handler
1441 * from freeing metadata of created ordered extent.
1442 */
1418 ret = btrfs_reloc_clone_csums(inode, cur_offset, 1443 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1419 num_bytes); 1444 num_bytes);
1420 if (ret) {
1421 if (!nolock && nocow)
1422 btrfs_end_write_no_snapshoting(root);
1423 goto error;
1424 }
1425 }
1426 1445
1427 extent_clear_unlock_delalloc(inode, cur_offset, 1446 extent_clear_unlock_delalloc(inode, cur_offset,
1428 cur_offset + num_bytes - 1, end, 1447 cur_offset + num_bytes - 1, end,
@@ -1434,6 +1453,14 @@ out_check:
1434 if (!nolock && nocow) 1453 if (!nolock && nocow)
1435 btrfs_end_write_no_snapshoting(root); 1454 btrfs_end_write_no_snapshoting(root);
1436 cur_offset = extent_end; 1455 cur_offset = extent_end;
1456
1457 /*
1458 * btrfs_reloc_clone_csums() error, now we're OK to call error
1459 * handler, as metadata for created ordered extent will only
1460 * be freed by btrfs_finish_ordered_io().
1461 */
1462 if (ret)
1463 goto error;
1437 if (cur_offset > end) 1464 if (cur_offset > end)
1438 break; 1465 break;
1439 } 1466 }