From d9f85963e3f7f5582552fdae54a2b89d6c62daf5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Aug 2014 10:43:00 +0100 Subject: Btrfs: fix corruption after write/fsync failure + fsync + log recovery While writing to a file, in inode.c:cow_file_range() (and same applies to submit_compressed_extents()), after reserving an extent for the file data, we create a new extent map for the written range and insert it into the extent map cache. After that, we create an ordered operation, but if it fails (due to a transient/temporary-ENOMEM), we return without dropping that extent map, which points to a reserved extent that is freed when we return. A subsequent incremental fsync (when the btrfs inode doesn't have the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and logs a file extent item based on that extent map, which points to a disk extent that doesn't contain valid data - it was freed by us earlier, at this point it might contain any random/garbage data. Therefore, if we reach an error condition when cowing a file range after we added the new extent map to the cache, drop it from the cache before returning. Some sequence of steps that lead to this: $ mkfs.btrfs -f /dev/sdd $ mount -o commit=9999 /dev/sdd /mnt $ cd /mnt $ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo $ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096" $ sync $ od -t x1 foo 0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 * 0020000 $ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo # Now this write + fsync fail with -ENOMEM, which was returned by # btrfs_add_ordered_extent() in inode.c:cow_file_range(). $ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo $ xfs_io -c "fsync" foo fsync: Cannot allocate memory # Now do a new write + fsync, which will succeed. Our previous # -ENOMEM was a transient/temporary error. $ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo $ xfs_io -c "fsync" foo # Our file content (in page cache) is now: $ od -t x1 foo 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 * 0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # Now reboot the machine, and mount the fs, so that fsync log replay # takes place. # The file content is now weird, in particular the first 8Kb, which # do not match our data before nor after the sync command above. $ od -t x1 foo 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # In fact these first 4Kb are a duplicate of the last 4kb block. # The last write got an extent map/file extent item that points to # the same disk extent that we got in the write+fsync that failed # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to # verify that: $ btrfs-debug-tree /dev/sdd (...) item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 12582912 nr 8192 extent data offset 0 nr 8192 ram 8192 item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 8192 ram 8192 item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53 extent data disk byte 12582912 nr 4096 extent data offset 0 nr 4096 ram 4096 $ umount /dev/sdd $ btrfsck /dev/sdd Checking filesystem on /dev/sdd UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f checking extents extent item 12582912 has multiple extent items ref mismatch on [12582912 4096] extent item 1, found 2 Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192 backpointer mismatch on [12582912 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 5 inode 257 errors 1000, some csum missing found 131074 bytes used err is 1 total csum bytes: 4 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123404 file data blocks allocated: 274432 referenced 274432 Btrfs v3.14.1-96-gcc7fd5a-dirty Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d020d6d9ace..7313571e1860 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -778,8 +778,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); goto out_free_reserve; + } /* * clear dirty, set writeback and unlock the pages. @@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) - goto out_reserve; + goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); if (ret) - goto out_reserve; + goto out_drop_extent_cache; } if (disk_num_bytes < cur_alloc_size) @@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, out: return ret; +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: -- cgit v1.2.2 From dac5705cad20070a70bb028ca52e1f0bc157b42d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Aug 2014 20:54:26 +0100 Subject: Btrfs: fix crash while doing a ranged fsync While doing a ranged fsync, that is, one whose range doesn't cover the whole possible file range (0 to LLONG_MAX), we can crash under certain circumstances with a trace like the following: [41074.641913] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[] [] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [] ? sched_clock_local+0x25/0xa0 [41074.644919] [] ? mutex_unlock+0xe/0x10 [41074.644919] [] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [] ? dget_parent+0x5/0x180 [41074.644919] [] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [] vfs_fsync_range+0x1b/0x30 (...) The necessary conditions that lead to such crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * a ranged fsync call happens (via an msync for example), with a range that doesn't cover the whole file and the end of this range, lets call it Y, is smaller than X; * btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater then the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such ordered operation exists. This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() -> btrfs_log_dentry_safe() -> btrfs_log_inode_parent() -> btrfs_log_inode() -> btrfs_truncate_inode_items() -> btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen if the fsync range covered the whole possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to finish before calling btrfs_truncate_inode_items(). So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items from a log tree, which isn't supposed to change the in memory inode's disk_i_size. Issue found while running xfstests/generic/127 (happens very rarely for me), more specifically via the fsx calls that use memory mapped IO (and issue msync calls). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7313571e1860..88823f4ca451 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4248,7 +4248,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- cgit v1.2.2 From b0d5d10f41a0f1cd839408dd94427f2db3553bca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 8 Sep 2014 13:08:51 -0700 Subject: Btrfs: use insert_inode_locked4 for inode creation Btrfs was inserting inodes into the hash table before we had fully set the inode up on disk. This leaves us open to rare races that allow two different inodes in memory for the same [root, inode] pair. This patch fixes things by using insert_inode_locked4 to insert an I_NEW inode and unlock_new_inode when we're ready for the rest of the kernel to use the inode. It also makes sure to init the operations pointers on the inode before going into the error handling paths. Signed-off-by: Chris Mason Reported-by: Al Viro --- fs/btrfs/inode.c | 176 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 67 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88823f4ca451..214b936bdd3d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5634,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) return ret; } +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -5726,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[1] = name_len + sizeof(*ref); } + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) - goto fail; + goto fail_unlock; inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); @@ -5752,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { @@ -5767,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5782,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_ino(inode), root->root_key.objectid, ret); return inode; + +fail_unlock: + unlock_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; @@ -5916,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); } + out_unlock: btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); @@ -5947,6 +5964,12 @@ out_unlock: iput(inode); } return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -5981,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } drop_inode_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_unlock; - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5998,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - goto out_unlock; + goto out_unlock_inode; - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); d_instantiate(dentry, inode); out_unlock: @@ -6017,6 +6040,11 @@ out_unlock: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6124,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; + goto out_fail_inode; btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail; + goto out_fail_inode; err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail; + goto out_fail_inode; d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. + */ + unlock_new_inode(inode); drop_on_err = 0; out_fail: @@ -6152,6 +6185,10 @@ out_fail: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; } /* helper for btfs_get_extent. Given an existing extent in the tree, @@ -8107,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) @@ -8757,12 +8795,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -8771,23 +8803,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; + goto out_unlock_inode; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - drop_inode = 1; - goto out_unlock; + goto out_unlock_inode; } key.objectid = btrfs_ino(inode); key.offset = 0; @@ -8796,9 +8827,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { - drop_inode = 1; btrfs_free_path(path); - goto out_unlock; + goto out_unlock_inode; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -8822,12 +8852,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); - if (err) + if (err) { drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_unlock: - if (!err) - d_instantiate(dentry, inode); btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); @@ -8835,6 +8868,11 @@ out_unlock: } btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -9018,14 +9056,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ret = btrfs_init_inode_security(trans, inode, dir, NULL); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - goto out; - inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -9033,9 +9063,16 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; ret = btrfs_orphan_add(trans, inode); if (ret) - goto out; + goto out_inode; /* * We set number of links to 0 in btrfs_new_inode(), and here we set @@ -9045,6 +9082,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); + unlock_new_inode(inode); d_tmpfile(dentry, inode); mark_inode_dirty(inode); @@ -9054,8 +9092,12 @@ out: iput(inode); btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); - return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + } static const struct inode_operations btrfs_dir_inode_operations = { -- cgit v1.2.2