From 45ea6095c8f0d6caad5658306416a5d254f1205e Mon Sep 17 00:00:00 2001
From: "slyich@gmail.com"
Date: Mon, 7 Nov 2011 16:08:01 -0500
Subject: btrfs: fix double-free 'tree_root' in 'btrfs_mount()'

On the error path 'tree_root' is freed in 'free_fs_info()'. No need to
free it explicitly. Noticed by SLUB in debug mode.

Complete reproducer under usermode linux (discovered on a real machine):

bdev=/dev/ubda
btr_root=/btr
/mkfs.btrfs $bdev
mount $bdev $btr_root
mkdir $btr_root/subvols/
cd $btr_root/subvols/
/btrfs su cr foo
/btrfs su cr bar
mount $bdev -osubvol=subvols/foo $btr_root/subvols/bar
umount $btr_root/subvols/bar

which gives:

device fsid 4d55aa28-45b1-474b-b4ec-da912322195e devid 1 transid 7 /dev/ubda
=============================================================================
BUG kmalloc-2048: Object already free
-----------------------------------------------------------------------------
INFO: Allocated in btrfs_mount+0x389/0x7f0 age=0 cpu=0 pid=277
INFO: Freed in btrfs_mount+0x51c/0x7f0 age=0 cpu=0 pid=277
INFO: Slab 0x0000000062886200 objects=15 used=9 fp=0x0000000070b4d2d0 flags=0x4081
INFO: Object 0x0000000070b4d2d0 @offset=21200 fp=0x0000000070b4a968
...
Call Trace:
70b31948: [<6008c522>] print_trailer+0xe2/0x130
70b31978: [<6008c5aa>] object_err+0x3a/0x50
70b319a8: [<6008e242>] free_debug_processing+0x142/0x2a0
70b319e0: [<600ebf6f>] btrfs_mount+0x55f/0x7f0
70b319f8: [<6008e5c1>] __slab_free+0x221/0x2d0

Signed-off-by: Sergei Trofimovich
Cc: Arne Jansen
Cc: Chris Mason
Cc: David Sterba
Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 57080dffdfc6..dcd5aef6b614 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -933,8 +933,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	 * then open_ctree will properly initialize everything later.
 	 */
 	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
+	if (!fs_info) {
+		error = -ENOMEM;
+		goto error_close_devices;
+	}
 	tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	if (!fs_info || !tree_root) {
+	if (!tree_root) {
 		error = -ENOMEM;
 		goto error_close_devices;
 	}
@@ -964,7 +968,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 		btrfs_close_devices(fs_devices);
 		free_fs_info(fs_info);
-		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
 
@@ -992,7 +995,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 error_close_devices:
 	btrfs_close_devices(fs_devices);
 	free_fs_info(fs_info);
-	kfree(tree_root);
 	return ERR_PTR(error);
 }
--
cgit v1.2.2


From 917c16b2b69fc2eeb432eabca73258f08c58361e Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Tue, 8 Nov 2011 14:49:59 -0500
Subject: Btrfs: fix oops on NULL trans handle in btrfs_truncate

If we fail to reserve space in the transaction during truncate, we can
error out with a NULL trans handle. The cleanup code needs an extra
check to make sure we aren't trying to use the bad handle.
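In outline, the defensive pattern is this (a simplified sketch, not the
exact truncate code; the way the handle is obtained here is illustrative,
the real hunk follows below):

    trans = btrfs_start_transaction(root, 2);
    if (IS_ERR(trans)) {
        err = PTR_ERR(trans);
        trans = NULL;   /* remember that we never got a handle */
    }
    ...
    if (trans) {        /* only dereference the handle if it is valid */
        ret = btrfs_update_inode(trans, root, inode);
        ...
    }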
Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9d0eaa57d4ee..f60e2490bd0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6529,14 +6529,16 @@ end_trans:
 		ret = btrfs_orphan_del(NULL, inode);
 	}
 
-	trans->block_rsv = &root->fs_info->trans_block_rsv;
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret && !err)
-		err = ret;
+	if (trans) {
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret && !err)
+			err = ret;
 
-	nr = trans->blocks_used;
-	ret = btrfs_end_transaction_throttle(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+		nr = trans->blocks_used;
+		ret = btrfs_end_transaction_throttle(trans, root);
+		btrfs_btree_balance_dirty(root, nr);
+	}
 out:
 	btrfs_free_block_rsv(root, rsv);
--
cgit v1.2.2


From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Tue, 8 Nov 2011 15:47:34 -0500
Subject: Btrfs: fix our reservations for updating an inode when completing io

People have been reporting ENOSPC crashes in finish_ordered_io. This is
because we try to steal from the delalloc block rsv to satisfy a
reservation to update the inode. The problem with this is we don't
explicitly save space for updating the inode when doing delalloc.

We've gotten away with this because way back when we just stole from the
delalloc reserve without any questions, and this worked out fine because,
generally speaking, the leaf had already been modified, either by the
mtime update when we did the original write or because we just updated
the leaf when we inserted the file extent item. Only on rare occasions
had the leaf not actually been modified, and that was still ok because
we'd just use a block or two out of the over-reservation that is
delalloc.

Then came the delayed inode stuff. This is amazing, except it wants a
full reservation for updating the inode, since it may do it at some
point down the road after we've written the blocks and we have to re-cow
everything again. This worked out because the delayed inode stuff just
stole from the global reserve, that is until recently, when I changed
that because it caused other problems.

So here we are, we're doing everything right and being screwed for it.
So take an extra reservation for the inode at delalloc reservation time
and carry it through the life of the delalloc reservation. If we need it
we can steal it in the delayed inode stuff. If we have already stolen
it, try to do a normal metadata reservation. If that fails, try to steal
from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so
I can start thinking of a better way to solve this, and in the meantime
we'll steal from the global reserve.

With this patch I ran xfstests 13 in a loop for a couple of hours and
didn't see any problems.
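The fallback order described above, as rough pseudocode of
btrfs_delayed_inode_reserve_metadata() (a sketch; the real logic is in
the diff below):

    if (inode still holds its delalloc meta reservation)
        migrate that reservation to the delayed rsv;    /* common case */
    else if (btrfs_block_rsv_add_noflush(...) == 0)
        done;                       /* got a reservation of our own */
    else if (migrating from the delalloc rsv succeeds)
        done;                       /* steal from delalloc after all */
    else {
        WARN_ON(1);                 /* shouldn't happen; needs a better fix */
        migrate from the global rsv;
    }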
Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/btrfs_inode.h   |  4 +--
 fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/extent-tree.c   | 22 +++++++++++++---
 fs/btrfs/inode.c         |  1 +
 4 files changed, 83 insertions(+), 9 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5a5d325a3935..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -147,14 +147,12 @@ struct btrfs_inode {
 	 * the btrfs file release call will add this inode to the
 	 * ordered operations list so that we make sure to flush out any
 	 * new data the application may have written before commit.
-	 *
-	 * yes, its silly to have a single bitflag, but we might grow more
-	 * of these.
 	 */
 	unsigned ordered_data_close:1;
 	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 	unsigned in_defrag:1;
+	unsigned delalloc_meta_reserved:1;
 
 	/*
 	 * always compress this one file
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bbe8496d5339..313ee14cf3b7 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 static int btrfs_delayed_inode_reserve_metadata(
 					struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
+					struct inode *inode,
 					struct btrfs_delayed_node *node)
 {
 	struct btrfs_block_rsv *src_rsv;
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
+	int release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata(
 		if (!ret)
 			node->bytes_reserved = num_bytes;
 		return ret;
+	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+		spin_lock(&BTRFS_I(inode)->lock);
+		if (BTRFS_I(inode)->delalloc_meta_reserved) {
+			BTRFS_I(inode)->delalloc_meta_reserved = 0;
+			spin_unlock(&BTRFS_I(inode)->lock);
+			release = true;
+			goto migrate;
+		}
+		spin_unlock(&BTRFS_I(inode)->lock);
+
+		/* Ok we didn't have space pre-reserved.  This shouldn't happen
+		 * too often but it can happen if we do delalloc to an existing
+		 * inode which gets dirtied because of the time update, and then
+		 * isn't touched again until after the transaction commits and
+		 * then we try to write out the data.  First try to be nice and
+		 * reserve something strictly for us.  If not be a pain and try
+		 * to steal from the delalloc block rsv.
+		 */
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+		if (!ret)
+			goto out;
+
+		/*
+		 * Ok this is a problem, let's just steal from the global rsv
+		 * since this really shouldn't happen that often.
+		 */
+		WARN_ON(1);
+		ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
+					      dst_rsv, num_bytes);
+		goto out;
 	}
 
+migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (!ret)
-		node->bytes_reserved = num_bytes;
+	if (unlikely(ret)) {
+		/* This shouldn't happen */
+		BUG_ON(release);
+		return ret;
+	}
+
+out:
+	/*
+	 * Migrate only takes a reservation, it doesn't touch the size of the
+	 * block_rsv.  This is to simplify people who don't normally have things
+	 * migrated from their block rsv.  If they go to release their
+	 * reservation, that will decrease the size as well, so if migrate
+	 * reduced size we'd end up with a negative size.  But for the
+	 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
+	 * but we could in fact do this reserve/migrate dance several times
+	 * between the time we did the original reservation and we'd clean it
+	 * up.  So to take care of this, release the space for the meta
+	 * reservation here.  I think it may be time for a documentation page on
+	 * how block rsvs. work.
+	 */
+	if (release)
+		btrfs_block_rsv_release(root, src_rsv, num_bytes);
 
+	node->bytes_reserved = num_bytes;
 	return ret;
 }
 
@@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
+	ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
+						   delayed_node);
 	if (ret)
 		goto release_node;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 18ea90c8943b..0b044e509e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
  */
 static unsigned drop_outstanding_extent(struct inode *inode)
 {
+	unsigned drop_inode_space = 0;
 	unsigned dropped_extents = 0;
 
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
+	if (BTRFS_I(inode)->outstanding_extents == 0 &&
+	    BTRFS_I(inode)->delalloc_meta_reserved) {
+		drop_inode_space = 1;
+		BTRFS_I(inode)->delalloc_meta_reserved = 0;
+	}
+
 	/*
 	 * If we have more or the same amount of outsanding extents than we have
 	 * reserved then we need to leave the reserved extents count alone.
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		return 0;
+		return drop_inode_space;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-	return dropped_extents;
+	return dropped_extents + drop_inode_space;
 }
 
 /**
@@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		nr_extents = BTRFS_I(inode)->outstanding_extents -
 			BTRFS_I(inode)->reserved_extents;
 		BTRFS_I(inode)->reserved_extents += nr_extents;
+	}
 
-	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+	/*
+	 * Add an item to reserve for updating the inode when we complete the
+	 * delalloc io.
+	 */
+	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+		nr_extents++;
+		BTRFS_I(inode)->delalloc_meta_reserved = 1;
 	}
+
+	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f60e2490bd0d..2b920596c126 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->orphan_meta_reserved = 0;
 	ei->dummy_inode = 0;
 	ei->in_defrag = 0;
+	ei->delalloc_meta_reserved = 0;
 	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
--
cgit v1.2.2


From a90e8b6fb80db43b029e1e76205452afa8bdc77a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 8 Nov 2011 16:47:55 +0200
Subject: Btrfs: fix memory leak in btrfs_parse_early_options()

Don't leak the subvol_name string in case multiple subvol= options are
given. The "latest option is effective" behavior (consistent with the
subvolid= and subvolrootid= options) is preserved.
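The last-option-wins idiom this preserves looks like the following
(illustrative; the one-line fix is in the diff below):

    case Opt_subvol:
        kfree(*subvol_name);    /* drop the value from any earlier subvol= */
        *subvol_name = match_strdup(&args[0]);
        break;

kfree(NULL) is a no-op, so the first subvol= option needs no special
casing.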
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dcd5aef6b614..6befcaf253bd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_subvol:
+			kfree(*subvol_name);
 			*subvol_name = match_strdup(&args[0]);
 			break;
 		case Opt_subvolid:
--
cgit v1.2.2


From f23c8af8ca2789eeb0ab9ea90c214f9694d96cc5 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Tue, 8 Nov 2011 19:15:05 +0200
Subject: Btrfs: fix subvol_name leak on error in btrfs_mount()

btrfs_parse_early_options() can fail due to an error while scanning
devices (the -o device= option), but it still strdup()s the subvol_name
string:

mount -o subvol=SUBV,device=BAD_DEVICE

So free the subvol_name string on error.

Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6befcaf253bd..58e9492230ce 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -905,8 +905,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name,
 					  &subvol_objectid, &subvol_rootid,
 					  &fs_devices);
-	if (error)
+	if (error) {
+		kfree(subvol_name);
 		return ERR_PTR(error);
+	}
 
 	if (subvol_name) {
 		root = mount_subvol(subvol_name, flags, device_name, data);
--
cgit v1.2.2


From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 00:08:15 +0200
Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree()

Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any
of the tree roots (first 'goto fail' in open_ctree()) we would
dereference a NULL fs_info pointer in free_fs_info(). Secondly, after
failures from init_srcu_struct(), setup_bdi() and new_inode() we would
leak all earlier allocated roots: fs_info fields haven't been
initialized yet, so free_fs_info() is rendered useless.

Fix this by initializing the fs_info pointer and fs_info fields before
any allocations happen.
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/disk-io.c | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e53a5bb85670..91db90b526c2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	u64 features;
 	struct btrfs_key location;
 	struct buffer_head *bh;
-	struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root),
-						 GFP_NOFS);
-	struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
-					       GFP_NOFS);
+	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = NULL;
-	struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
-						GFP_NOFS);
-	struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
-					      GFP_NOFS);
+	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_root *extent_root;
+	struct btrfs_root *csum_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
 
-	struct btrfs_super_block *disk_super;
+	extent_root = fs_info->extent_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	csum_root = fs_info->csum_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	chunk_root = fs_info->chunk_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	dev_root = fs_info->dev_root =
+			kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
 
-	if (!extent_root || !tree_root || !tree_root->fs_info ||
-	    !chunk_root || !dev_root || !csum_root) {
+	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
-	fs_info = tree_root->fs_info;
 
 	ret = init_srcu_struct(&fs_info->subvol_srcu);
 	if (ret) {
@@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
-	fs_info->tree_root = tree_root;
-	fs_info->extent_root = extent_root;
-	fs_info->csum_root = csum_root;
-	fs_info->chunk_root = chunk_root;
-	fs_info->dev_root = dev_root;
-	fs_info->fs_devices = fs_devices;
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
--
cgit v1.2.2


From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 13:26:37 +0200
Subject: Btrfs: close devices on all error paths in open_ctree()

Fix a bug introduced by 7e662854 where we would leave devices busy on
certain error paths in open_ctree(). fs_info is guaranteed to be
non-NULL now, so it's safe to dereference it on all error paths.
Signed-off-by: Ilya Dryomov
---
 fs/btrfs/disk-io.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 91db90b526c2..b6a5c0dd0dd8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2460,21 +2460,20 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
 fail_iput:
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-
-	btrfs_close_devices(fs_info->fs_devices);
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_close_devices(fs_info->fs_devices);
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
 
 recovery_tree_root:
-
 	if (!btrfs_test_opt(tree_root, RECOVERY))
 		goto fail_tree_roots;
--
cgit v1.2.2


From 04d21a244fdf79d0ac892eaaa9a46b682467277c Mon Sep 17 00:00:00 2001
From: Ilya Dryomov
Date: Wed, 9 Nov 2011 14:41:22 +0200
Subject: Btrfs: rework error handling in btrfs_mount()

Commits 6c41761f and 45ea6095 introduced the possibility of a NULL
pointer dereference on error paths. We would also leave all devices busy
and leak fs_info with all of its sub-structures on error when trying to
mount an already mounted fs to a different directory.

Fix this by doing all allocations before trying to open any of the
devices, and adjust the error path for the mount-already-mounted-fs
case.

Signed-off-by: Ilya Dryomov
---
 fs/btrfs/super.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 58e9492230ce..629281c65ff5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -891,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	struct super_block *s;
 	struct dentry *root;
 	struct btrfs_fs_devices *fs_devices = NULL;
-	struct btrfs_root *tree_root = NULL;
 	struct btrfs_fs_info *fs_info = NULL;
 	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
@@ -920,15 +919,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (error)
 		return ERR_PTR(error);
 
-	error = btrfs_open_devices(fs_devices, mode, fs_type);
-	if (error)
-		return ERR_PTR(error);
-
-	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
-		error = -EACCES;
-		goto error_close_devices;
-	}
-
 	/*
 	 * Setup a dummy root and fs_info for test/set super. This is because
 	 * we don't actually fill this stuff out until open_ctree, but we need
	 * it for searching for existing supers, so this lets us do that and
 	 * then open_ctree will properly initialize everything later.
 	 */
 	fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
-	if (!fs_info) {
-		error = -ENOMEM;
-		goto error_close_devices;
-	}
-	tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	if (!tree_root) {
+	if (!fs_info)
+		return ERR_PTR(-ENOMEM);
+
+	fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+	if (!fs_info->tree_root) {
 		error = -ENOMEM;
-		goto error_close_devices;
+		goto error_fs_info;
 	}
-	fs_info->tree_root = tree_root;
+	fs_info->tree_root->fs_info = fs_info;
 	fs_info->fs_devices = fs_devices;
-	tree_root->fs_info = fs_info;
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
 	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
 	if (!fs_info->super_copy || !fs_info->super_for_commit) {
 		error = -ENOMEM;
+		goto error_fs_info;
+	}
+
+	error = btrfs_open_devices(fs_devices, mode, fs_type);
+	if (error)
+		goto error_fs_info;
+
+	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
+		error = -EACCES;
 		goto error_close_devices;
 	}
 
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super,
+		 fs_info->tree_root);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto error_close_devices;
@@ -966,7 +964,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			deactivate_locked_super(s);
-			return ERR_PTR(-EBUSY);
+			error = -EBUSY;
+			goto error_close_devices;
 		}
 
 		btrfs_close_devices(fs_devices);
@@ -997,6 +996,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
+error_fs_info:
 	free_fs_info(fs_info);
 	return ERR_PTR(error);
 }
--
cgit v1.2.2


From 2115133f8b9a8dbdb217d14080814df07ce90479 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 10 Nov 2011 20:39:08 -0500
Subject: Btrfs: tweak the delayed inode reservations again

Josef sent along an incremental to the inode reservation code to make
sure we try and fall back to directly updating the inode item if things
go horribly wrong.

This reworks that patch slightly, adding a fallback function that will
always try to update the inode item directly without going through the
delayed_inode code.

Signed-off-by: Chris Mason
---
 fs/btrfs/delayed-inode.c |  9 +++----
 fs/btrfs/inode.c         | 64 +++++++++++++++++++++++++++++++++---------------
 2 files changed, 47 insertions(+), 26 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 313ee14cf3b7..6a1a6800776c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -692,11 +692,6 @@ static int btrfs_delayed_inode_reserve_metadata(
 
 migrate:
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (unlikely(ret)) {
-		/* This shouldn't happen */
-		BUG_ON(release);
-		return ret;
-	}
 
 out:
 	/*
@@ -712,9 +707,11 @@ out:
 	 * reservation here.  I think it may be time for a documentation page on
 	 * how block rsvs. work.
 	 */
+	if (!ret)
+		node->bytes_reserved = num_bytes;
+
 	if (release)
 		btrfs_block_rsv_release(root, src_rsv, num_bytes);
-	node->bytes_reserved = num_bytes;
 
 	return ret;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2b920596c126..7d394335bcb5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode);
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode, struct inode *dir,
@@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode_fallback(trans, root, inode);
 		BUG_ON(ret);
 	}
 	goto out;
@@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode(trans, root, inode);
+		ret = btrfs_update_inode_fallback(trans, root, inode);
 		BUG_ON(ret);
 	}
 	ret = 0;
@@ -2426,7 +2428,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 /*
  * copy everything in the in-memory inode into the btree.
  */
-noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode)
 {
 	struct btrfs_inode_item *inode_item;
@@ -2434,21 +2436,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	struct extent_buffer *leaf;
 	int ret;
 
-	/*
-	 * If the inode is a free space inode, we can deadlock during commit
-	 * if we put it into the delayed code.
-	 *
-	 * The data relocation inode should also be directly updated
-	 * without delay
-	 */
-	if (!btrfs_is_free_space_inode(root, inode)
-	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
-		ret = btrfs_delayed_update_inode(trans, root, inode);
-		if (!ret)
-			btrfs_set_inode_last_trans(trans, inode);
-		return ret;
-	}
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2476,6 +2463,43 @@ failed:
 	return ret;
 }
 
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
+noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	int ret;
+
+	/*
+	 * If the inode is a free space inode, we can deadlock during commit
+	 * if we put it into the delayed code.
+	 *
+	 * The data relocation inode should also be directly updated
+	 * without delay
+	 */
+	if (!btrfs_is_free_space_inode(root, inode)
+	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+		ret = btrfs_delayed_update_inode(trans, root, inode);
+		if (!ret)
+			btrfs_set_inode_last_trans(trans, inode);
+		return ret;
+	}
+
+	return btrfs_update_inode_item(trans, root, inode);
+}
+
+static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode)
+{
+	int ret;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret == -ENOSPC)
+		return btrfs_update_inode_item(trans, root, inode);
+	return ret;
+}
+
 /*
  * unlink helper that gets used here in inode.c and in the tree logging
  * recovery code. It remove a link in a directory with a given name, and
@@ -5632,7 +5656,7 @@ again:
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 		if (!ret)
-			err = btrfs_update_inode(trans, root, inode);
+			err = btrfs_update_inode_fallback(trans, root, inode);
 		goto out;
 	}
 
@@ -5670,7 +5694,7 @@ again:
 	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
 	ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-		btrfs_update_inode(trans, root, inode);
+		btrfs_update_inode_fallback(trans, root, inode);
 	ret = 0;
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
--
cgit v1.2.2


From 924cd8fbe41851eda2b68bf2ed501b2777fd77b4 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:04 -0500
Subject: Btrfs: fix nocow when deleting the item

btrfs_previous_item() just searches the b+ tree and does not COW the
nodes or leaves; if we modify the result it returns, the metadata will
be broken. Fix it.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/volumes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f8e2943101a1..c37433d3cd82 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	key.objectid = device->devid;
 	key.offset = start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
-
+again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = btrfs_previous_item(root, path, key.objectid,
@@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 					struct btrfs_dev_extent);
 		BUG_ON(found_key.offset > start || found_key.offset +
 		       btrfs_dev_extent_length(leaf, extent) < start);
+		key = found_key;
+		btrfs_release_path(path);
+		goto again;
 	} else if (ret == 0) {
 		leaf = path->nodes[0];
 		extent = btrfs_item_ptr(leaf, path->slots[0],
--
cgit v1.2.2


From ba38eb4de354d228f2792f93cde2c748a3a3f3b2 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:04 -0500
Subject: Btrfs: fix no reserved space for writing out inode cache

The inode cache code forgets to reserve space before writing the cache
out. When we run a stress test such as synctest, it triggers the
WARN_ON() in use_block_rsv():

WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]()
...
Call Trace:
[] warn_slowpath_common+0x80/0x98
[] warn_slowpath_null+0x15/0x17
[] btrfs_alloc_free_block+0xbf/0x281 [btrfs]
[] ? __set_page_dirty_nobuffers+0xfe/0x108
[] __btrfs_cow_block+0x118/0x3b5 [btrfs]
[] btrfs_cow_block+0x103/0x14e [btrfs]
[] btrfs_search_slot+0x249/0x6a4 [btrfs]
[] btrfs_lookup_inode+0x2a/0x8a [btrfs]
[] btrfs_update_inode+0xaa/0x141 [btrfs]
[] btrfs_save_ino_cache+0xea/0x202 [btrfs]
[] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs]
[] commit_fs_roots+0xaa/0x158 [btrfs]
[] btrfs_commit_transaction+0x405/0x731 [btrfs]
[] ? wake_up_bit+0x25/0x25
[] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs]
[] btrfs_sync_file+0x16a/0x198 [btrfs]
[] ? mntput+0x21/0x23
[] vfs_fsync_range+0x18/0x21
[] vfs_fsync+0x17/0x19
[] do_fsync+0x29/0x3e
[] sys_fsync+0xb/0xf
[] system_call_fastpath+0x16/0x1b

Sometimes the BUG_ON() in the delayed inode reservation code is also
triggered. So we must reserve enough space for the inode cache.

Note: if we cannot reserve enough space for the inode cache, we give up
writing it out.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 53dcbdf446cd..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
 	struct btrfs_path *path;
 	struct inode *inode;
+	struct btrfs_block_rsv *rsv;
+	u64 num_bytes;
 	u64 alloc_hint = 0;
 	int ret;
 	int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
 	if (!path)
 		return -ENOMEM;
 
+	rsv = trans->block_rsv;
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+	num_bytes = trans->bytes_reserved;
+	/*
+	 * 1 item for inode item insertion if need
+	 * 3 items for inode item update (in the worst case)
+	 * 1 item for free space object
+	 * 3 items for pre-allocation
+	 */
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
+	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
+					  trans->bytes_reserved);
+	if (ret)
+		goto out;
 again:
 	inode = lookup_free_ino_inode(root, path);
 	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
 		ret = PTR_ERR(inode);
-		goto out;
+		goto out_release;
 	}
 
 	if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
 
 		ret = create_free_ino_inode(root, trans, path);
 		if (ret)
-			goto out;
+			goto out_release;
 		goto again;
 	}
 
@@ -477,11 +494,14 @@ again:
 	}
 	btrfs_free_reserved_data_space(inode, prealloc);
 
+	ret = btrfs_write_out_ino_cache(root, trans, path);
 out_put:
 	iput(inode);
+out_release:
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
-	if (ret == 0)
-		ret = btrfs_write_out_ino_cache(root, trans, path);
+	trans->block_rsv = rsv;
+	trans->bytes_reserved = num_bytes;
 
 	btrfs_free_path(path);
 	return ret;
--
cgit v1.2.2


From 3254c87618354e58fa2a7b375c6664f567480c33 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix unreleased path in btrfs_orphan_cleanup()

When we ran a stress test for space relocation, a deadlock happened.
Debugging showed it was caused by carelessness: we forgot to unlock the
read lock on the extent buffers in btrfs_orphan_cleanup() before ending
the transaction handle. The transaction commit task waited for the task
that called btrfs_orphan_cleanup() to unlock the extent buffer, while
that task waited for the commit task to finish the transaction commit,
and so the deadlock happened. Fix it.
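The required lock ordering, simplified (the actual call sites are in the
diff below):

    btrfs_release_path(path);       /* drop the extent buffer read locks */
    ...
    btrfs_end_transaction(trans, root);  /* commit can no longer block on us */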
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/inode.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d394335bcb5..e16215f480d0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2201,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		if (ret)
 			goto out;
 	}
+	/* release the path since we're done with it */
+	btrfs_release_path(path);
+
 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
 	if (root->orphan_block_rsv)
--
cgit v1.2.2


From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush}

btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that
code out into a common helper.

Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0b044e509e9f..0f47b3e2010e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
+static inline int __block_rsv_add(struct btrfs_root *root,
+				  struct btrfs_block_rsv *block_rsv,
+				  u64 num_bytes, int flush)
 {
 	int ret;
 
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes)
+{
+	return __block_rsv_add(root, block_rsv, num_bytes, 1);
+}
+
 int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
 				struct btrfs_block_rsv *block_rsv,
 				u64 num_bytes)
 {
-	int ret;
-
-	if (num_bytes == 0)
-		return 0;
-
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
-	if (!ret) {
-		block_rsv_add_bytes(block_rsv, num_bytes, 1);
-		return 0;
-	}
-
-	return ret;
+	return __block_rsv_add(root, block_rsv, num_bytes, 0);
 }
 
 int btrfs_block_rsv_check(struct btrfs_root *root,
--
cgit v1.2.2


From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix orphan backref nodes

If the root node of a fs/file tree is in the block group that is being
relocated, but the other nodes are not in the other block groups, and we
create a snapshot of this tree between the point where the relocation
tree creation ends and the point where ->create_reloc_tree is set back
to 0, Btrfs will create some backref nodes that are the lowest nodes of
the backref cache. But we forgot to add them to the ->leaves list of the
backref cache and deal with them, so eventually they trigger a BUG_ON():

kernel BUG at fs/btrfs/relocation.c:239!

This patch fixes it by adding them to the ->leaves list of the backref
cache.
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/relocation.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 24d654ce7a06..dff29d5e151a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
 			list_add_tail(&new_edge->list[UPPER],
 				      &new_node->lower);
 		}
+	} else {
+		list_add_tail(&new_node->lower, &cache->leaves);
 	}
 
 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
--
cgit v1.2.2


From 2f120c05e67ae34c93786b1050c6828904314429 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: only map pages if we know we need them when reading the space cache

People have been running into a warning when loading the space cache
because the page is already mapped when trying to read in a bitmap. The
way we read in entries and pages is kind of convoluted, so fix it so
that io_ctl_read_entry maps the entries if it needs to, and if it hits
the end of the page it simply unmaps the page. That way we can
unconditionally unmap the io_ctl before reading in the bitmap and we
should stop hitting these warnings. Thanks,

Signed-off-by: Josef Bacik
Signed-off-by: Chris Mason
---
 fs/btrfs/free-space-cache.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 7a15fcfb3e1f..181760f9d2ab 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -537,6 +537,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 			    struct btrfs_free_space *entry, u8 *type)
 {
 	struct btrfs_free_space_entry *e;
+	int ret;
+
+	if (!io_ctl->cur) {
+		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+		if (ret)
+			return ret;
+	}
 
 	e = io_ctl->cur;
 	entry->offset = le64_to_cpu(e->offset);
@@ -550,10 +557,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
 
 	io_ctl_unmap_page(io_ctl);
 
-	if (io_ctl->index >= io_ctl->num_pages)
-		return 0;
-
-	return io_ctl_check_crc(io_ctl, io_ctl->index);
+	return 0;
 }
 
 static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
@@ -561,9 +565,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
 {
 	int ret;
 
-	if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
-		io_ctl_unmap_page(io_ctl);
-
 	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
 	if (ret)
 		return ret;
@@ -699,6 +700,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		num_entries--;
 	}
 
+	io_ctl_unmap_page(&io_ctl);
+
 	/*
 	 * We add the bitmaps at the end of the entries in order that
 	 * the bitmap entries are added to the cache.
--
cgit v1.2.2


From 62f30c5462374b991e7e3f42d49ce2265c1b82f1 Mon Sep 17 00:00:00 2001
From: Miao Xie
Date: Thu, 10 Nov 2011 20:45:05 -0500
Subject: Btrfs: fix deadlock caused by the race between relocation

We cannot do a flushable reservation for relocation when we create a
snapshot, because it may make the transaction commit task and the flush
task wait for each other, and then a deadlock happens.
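For reference, the two reservation flavors (per the __block_rsv_add()
helper introduced earlier in this series) differ only in the flush flag:

    /* flush == 1: may kick off and wait for writeback,
     * so it can deadlock against a transaction commit */
    btrfs_block_rsv_add(root, rsv, num_bytes);

    /* flush == 0: only takes space that is already free */
    btrfs_block_rsv_add_noflush(root, rsv, num_bytes);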
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/transaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 960835eaf4da..6a0574e923bc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -882,8 +882,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
-					  to_reserve);
+		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
+						  to_reserve);
 		if (ret) {
 			pending->error = ret;
 			goto fail;
--
cgit v1.2.2


From 69f4cb526bd02ae5af35846f9a710c099eec3347 Mon Sep 17 00:00:00 2001
From: Arne Jansen
Date: Fri, 11 Nov 2011 08:17:10 -0500
Subject: Btrfs: handle bio_add_page failure gracefully in scrub

Currently scrub fails with ENOMEM when bio_add_page fails. Unfortunately
dm-based targets accept only one page per bio, thus making scrub always
fail. This patch just submits the current bio when an error is
encountered and starts a new one.

Signed-off-by: Arne Jansen
Signed-off-by: Chris Mason
---
 fs/btrfs/scrub.c | 64 +++++++++++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ed11d3866afd..f4190f22edfb 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
 static int scrub_submit(struct scrub_dev *sdev)
 {
 	struct scrub_bio *sbio;
-	struct bio *bio;
-	int i;
 
 	if (sdev->curr == -1)
 		return 0;
 
 	sbio = sdev->bios[sdev->curr];
-
-	bio = bio_alloc(GFP_NOFS, sbio->count);
-	if (!bio)
-		goto nomem;
-
-	bio->bi_private = sbio;
-	bio->bi_end_io = scrub_bio_end_io;
-	bio->bi_bdev = sdev->dev->bdev;
-	bio->bi_sector = sbio->physical >> 9;
-
-	for (i = 0; i < sbio->count; ++i) {
-		struct page *page;
-		int ret;
-
-		page = alloc_page(GFP_NOFS);
-		if (!page)
-			goto nomem;
-
-		ret = bio_add_page(bio, page, PAGE_SIZE, 0);
-		if (!ret) {
-			__free_page(page);
-			goto nomem;
-		}
-	}
-
 	sbio->err = 0;
 	sdev->curr = -1;
 	atomic_inc(&sdev->in_flight);
 
-	submit_bio(READ, bio);
+	submit_bio(READ, sbio->bio);
 
 	return 0;
-
-nomem:
-	scrub_free_bio(bio);
-
-	return -ENOMEM;
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
 		      u8 *csum, int force)
 {
 	struct scrub_bio *sbio;
+	struct page *page;
+	int ret;
 
 again:
 	/*
@@ -1015,12 +985,22 @@ again:
 	}
 	sbio = sdev->bios[sdev->curr];
 	if (sbio->count == 0) {
+		struct bio *bio;
+
 		sbio->physical = physical;
 		sbio->logical = logical;
+		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
+		if (!bio)
+			return -ENOMEM;
+
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_bio_end_io;
+		bio->bi_bdev = sdev->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+		sbio->bio = bio;
 	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
 		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-		int ret;
-
 		ret = scrub_submit(sdev);
 		if (ret)
 			return ret;
@@ -1030,6 +1010,20 @@ again:
 	sbio->spag[sbio->count].generation = gen;
 	sbio->spag[sbio->count].have_csum = 0;
 	sbio->spag[sbio->count].mirror_num = mirror_num;
+
+	page = alloc_page(GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
+	if (!ret) {
+		__free_page(page);
+		ret = scrub_submit(sdev);
+		if (ret)
+			return ret;
+		goto again;
+	}
+
 	if (csum) {
 		sbio->spag[sbio->count].have_csum = 1;
 		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
--
cgit v1.2.2


From 8965593e41dd2d0e2a2f1e6f245336005ea94a2c Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Fri, 11 Nov 2011 10:14:57 -0500
Subject: btrfs: rename the option to nospace_cache

Rename the no_space_cache option to nospace_cache to be more consistent
with the rest, where the simple prefix 'no' is used to negate an option.

The option was introduced during the -rc1 cycle and has not been widely
used, so the rename is safe.

Signed-off-by: David Sterba
Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 629281c65ff5..8bd9d6d0e07a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -197,7 +197,7 @@ static match_table_t tokens = {
 	{Opt_subvolrootid, "subvolrootid=%d"},
 	{Opt_defrag, "autodefrag"},
 	{Opt_inode_cache, "inode_cache"},
-	{Opt_no_space_cache, "no_space_cache"},
+	{Opt_no_space_cache, "nospace_cache"},
 	{Opt_recovery, "recovery"},
 	{Opt_err, NULL},
 };
@@ -711,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
 	else
-		seq_puts(seq, ",no_space_cache");
+		seq_puts(seq, ",nospace_cache");
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		seq_puts(seq, ",clear_cache");
 	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
--
cgit v1.2.2


From f1ebcc74d5b2159f44c96b479b6eb8afc7829095 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Mon, 14 Nov 2011 20:48:06 -0500
Subject: Btrfs: fix tree corruption after multi-thread snapshots and inode_cache flush

The btrfs snapshotting code requires that once a root has been
snapshotted, we don't change it during a commit. But there are two cases
that lead to tree corruption:

1) multi-thread snapshots can commit several snapshots in a transaction,
   and this may change the src root when processing the following
   pending snapshots, which corrupts the earlier snapshots;

2) the free inode cache was changing the roots when it wrote out the
   cache, which led to corruption.

This fixes things by making sure we force COW the block after we create
a snapshot during committing a transaction, then any changes to the
roots will result in COW, and we get all the fs roots and snapshot roots
to be consistent.

Signed-off-by: Liu Bo
Signed-off-by: Miao Xie
Signed-off-by: Chris Mason
---
 fs/btrfs/ctree.c       | 17 ++++++++++++++++-
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/transaction.c |  8 ++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0fe615e4ea38..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
 {
+	/* ensure we can see the force_cow */
+	smp_rmb();
+
+	/*
+	 * We do not need to cow a block if
+	 * 1) this block is not created or changed in this transaction;
+	 * 2) this block does not belong to TREE_RELOC tree;
+	 * 3) the root is not forced COW.
+	 *
+	 * What is forced COW:
+	 *    when we create snapshot during commiting the transaction,
+	 *    after we've finished coping src root, we must COW the shared
+	 *    block to ensure the metadata consistency.
+	 */
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
 	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
-	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
+	    !root->force_cow)
 		return 0;
 	return 1;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9ba59ff9292..b1cb3c052484 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1271,6 +1271,8 @@ struct btrfs_root {
 	 * for stat. It may be used for more later
 	 */
 	dev_t anon_dev;
+
+	int force_cow;
 };
 
 struct btrfs_ioctl_defrag_range_args {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 6a0574e923bc..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 
 		btrfs_save_ino_cache(root, trans);
 
+		/* see comments in should_cow_block() */
+		root->force_cow = 0;
+		smp_wmb();
+
 		if (root->commit_root != root->node) {
 			mutex_lock(&root->fs_commit_mutex);
 			switch_commit_root(root);
@@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 
+	/* see comments in should_cow_block() */
+	root->force_cow = 1;
+	smp_wmb();
+
 	btrfs_set_root_node(new_root_item, tmp);
 	/* record when the snapshot was created in key.offset */
 	key.offset = trans->transid;
--
cgit v1.2.2


From 8d514bbf37eecf0a3e309284728637816a36764b Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 16:06:09 -0500
Subject: btrfs: fix double mntput() in mount_subvol()

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8bd9d6d0e07a..969a7747e889 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -861,7 +861,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 
 	if (!is_subvolume_inode(path.dentry->d_inode)) {
 		path_put(&path);
-		mntput(mnt);
 		error = -EINVAL;
 		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
 				subvol_name);
--
cgit v1.2.2


From c13344958780b4046305ee6235d686c846535529 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 16:12:14 -0500
Subject: switch create_mnt_ns() to saner calling conventions, fix double mntput() in nfs

Life is much saner if create_mnt_ns(mnt) drops mnt in case of error...
Switch it to such calling conventions, switch callers, and fix the
double mntput() in fs/nfs/super.c.

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 969a7747e889..cfbedd7755b0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -843,10 +843,8 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 		return ERR_CAST(mnt);
 
 	ns_private = create_mnt_ns(mnt);
-	if (IS_ERR(ns_private)) {
-		mntput(mnt);
+	if (IS_ERR(ns_private))
 		return ERR_CAST(ns_private);
-	}
 
 	/*
 	 * This will trigger the automount of the subvol so we can just
--
cgit v1.2.2


From ea441d1104cf1efb471fa81bc91e9fd1e6ae29fd Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Wed, 16 Nov 2011 21:43:59 -0500
Subject: new helper: mount_subtree()

Takes a vfsmount and a relative path, does a lookup within that vfsmount
(possibly triggering automounts) and returns the result as the root of a
subtree suitable for return by ->mount() (i.e. a reference to the dentry
and an active reference to its superblock grabbed, superblock locked
exclusive). btrfs and nfs switched to it instead of open-coding the
sucker.

Signed-off-by: Al Viro
---
 fs/btrfs/super.c | 35 ++++++-----------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

(limited to 'fs/btrfs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index cfbedd7755b0..17ee7fc5e64e 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -825,13 +825,9 @@ static char *setup_root_args(char *args)
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
 				   const char *device_name, char *data)
 {
-	struct super_block *s;
 	struct dentry *root;
 	struct vfsmount *mnt;
-	struct mnt_namespace *ns_private;
 	char *newargs;
-	struct path path;
-	int error;
 
 	newargs = setup_root_args(data);
 	if (!newargs)
@@ -842,36 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 	if (IS_ERR(mnt))
 		return ERR_CAST(mnt);
 
-	ns_private = create_mnt_ns(mnt);
-	if (IS_ERR(ns_private))
-		return ERR_CAST(ns_private);
-
-	/*
-	 * This will trigger the automount of the subvol so we can just
-	 * drop the mnt we have here and return the dentry that we
-	 * found.
-	 */
-	error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
-				LOOKUP_FOLLOW, &path);
-	put_mnt_ns(ns_private);
-	if (error)
-		return ERR_PTR(error);
+	root = mount_subtree(mnt, subvol_name);
 
-	if (!is_subvolume_inode(path.dentry->d_inode)) {
-		path_put(&path);
-		error = -EINVAL;
+	if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
+		struct super_block *s = root->d_sb;
+		dput(root);
+		root = ERR_PTR(-EINVAL);
+		deactivate_locked_super(s);
 		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
 				subvol_name);
-		return ERR_PTR(-EINVAL);
 	}
 
-	/* Get a ref to the sb and the dentry we found and return it */
-	s = path.mnt->mnt_sb;
-	atomic_inc(&s->s_active);
-	root = dget(path.dentry);
-	path_put(&path);
-
-	down_write(&s->s_umount);
-
 	return root;
 }
--
cgit v1.2.2


From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Fri, 18 Nov 2011 15:07:51 -0500
Subject: Btrfs: fix barrier flushes

When btrfs is writing the super blocks, it sends barrier flushes to make
sure writeback caching drives get all the metadata on disk in the right
order.

But we have two bugs in the way these are sent down. When doing full
commits (not via the tree log), we are sending the barrier down before
the last super, when it should be going down before the first. In
multi-device setups, we should be waiting for the barriers to complete
on all devices before writing any of the supers.

Both of these bugs can cause corruptions on power failures. We fix it
with some new code to send down empty barriers to all devices before
writing the first super.

Alexandre Oliva found the multi-device bug. Arne Jansen did the async
barrier loop.
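The resulting flow of write_all_supers(), in outline (a sketch; the full
implementation is in the diff below):

    /* phase 1: send an empty WRITE_FLUSH bio to every writeable device */
    list_for_each_entry_rcu(dev, head, dev_list)
        write_dev_flush(dev, 0);

    /* phase 2: wait for every one of those flushes to complete */
    list_for_each_entry_rcu(dev, head, dev_list)
        write_dev_flush(dev, 1);

    /* only now write the supers; the first copy on each device
     * goes down with FUA */
    write_dev_supers(...);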
Signed-off-by: Chris Mason Reported-by: Alexandre Oliva --- fs/btrfs/disk-io.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.h | 6 +++ 2 files changed, 134 insertions(+), 17 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6a5c0dd0dd8..48d30138237f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. 
+ * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c49f352..78f2d4d4f37f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { -- cgit v1.2.2 From 745c4d8e160afaf6c75e887c27ea4b75c8142b26 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 20 Nov 2011 07:31:57 -0500 Subject: btrfs: Fix up 32/64-bit compatibility for new ioctls This patch casts to unsigned long before casting to a pointer and fixes the following warnings: fs/btrfs/extent_io.c:2289:20: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2933:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2937:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/ioctl.c:3020:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/scrub.c:275:4: warning: cast to 
pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/backref.c:686:27: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/ioctl.c | 9 ++++++--- fs/btrfs/scrub.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad3929c..22c64fff1bd5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d0e7a0..47fdba7853c3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2286,7 +2286,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } if (!uptodate) { u64 failed_mirror; - failed_mirror = (u64)bio->bi_bdev; + failed_mirror = (unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c472f126..bda53fcd1a09 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4190f22edfb..fab420db5121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)ipath->fspath->val[i]); + (char *)(unsigned long)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.2.2 From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: mirror_num should be int, not u64 My previous patch introduced some u64 for failed_mirror variables, this one makes it consistent again. 
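The conversion is mechanical, but the underlying idiom is worth spelling out: the failed mirror number is smuggled through bio->bi_bdev, a pointer-sized field, so it has to round-trip through unsigned long, which is pointer-sized on every ABI the kernel supports. A minimal user-space sketch of that round trip (the helper names here are illustrative, not from the btrfs code):

#include <assert.h>
#include <stdio.h>

/* Stash a small integer in a pointer-sized slot. Casting through
 * unsigned long keeps both 32-bit and 64-bit builds free of the
 * -Wint-to-pointer-cast / -Wpointer-to-int-cast warnings. */
static void *stash_mirror(int mirror)
{
	return (void *)(unsigned long)mirror;
}

/* Recover it: pointer -> unsigned long -> int, never a direct
 * pointer-to-int cast. */
static int unstash_mirror(void *slot)
{
	return (int)(unsigned long)slot;
}

int main(void)
{
	void *slot = stash_mirror(3);

	assert(unstash_mirror(slot) == 3);
	printf("failed_mirror = %d\n", unstash_mirror(slot));
	return 0;
}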
Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48d30138237f..94abc25392f6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47fdba7853c3..a877b8b212eb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (unsigned long)bio->bi_bdev; + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0e23bc..7604c3001322 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, -- cgit v1.2.2 From 0f0fbf1d0e188d129756e9508090af4bdbfde00b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: fix to search one more bitmap for cluster setup Suppose there are two bitmaps [0, 256], [256, 512] and one extent [100, 120] in the free space cache, and we want to set up a cluster with offset=100, bytes=50. In this case, there will be only one bitmap [256, 512] in the temporary bitmaps list, and then setup_cluster_bitmap() won't search bitmap [0, 256]. The cause is, the list is constructed in setup_cluster_no_bitmap(), and only bitmaps with bitmap_entry->offset >= offset will be added into the list, and the very bitmap that covers offset has bitmap_entry->offset <= offset. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f9d2ab..8f792f41feab 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2453,10 +2453,22 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry; struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; + /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + /* * First check our cached list of bitmaps and see if there is an entry * here that will work. 
-- cgit v1.2.2 From 52621cb6ed0e0e14358bb317bda7cd5fbd5c2a27 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: avoid unnecessary bitmap search for cluster setup setup_cluster_no_bitmap() searches all the extents and bitmaps starting from offset. Therefore, if it returns -ENOSPC, all the bitmaps starting from offset are in the bitmaps list, so it's sufficient to search from this list in setup_cluster_bitmap(). Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 42 ++++-------------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f792f41feab..8c32434da2cb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2451,7 +2451,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); @@ -2469,10 +2468,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, list_add(&entry->list, bitmaps); } - /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. - */ list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2483,38 +2478,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2532,8 +2499,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2572,7 +2539,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) -- cgit v1.2.2 From fadc0d8be4dfca80f6c568bc5874931893c6709b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: fix stat blocks accounting Round inode bytes and delalloc bytes up to real blocksize before converting to sector size. Otherwise, e.g. files smaller than 512 bytes are reported with zero blocks due to incorrect rounding. 
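A worked example makes the failure mode concrete: with a 4096-byte block size, a 100-byte file has inode_get_bytes() == 100, and 100 >> 9 == 0, so stat reported zero 512-byte sectors; rounding up to a full block first yields the expected 8. A small user-space sketch of the corrected arithmetic (the ALIGN macro below is assumed to behave like the kernel's power-of-two round-up):

#include <stdio.h>

/* Power-of-two round-up, same shape as the kernel's ALIGN(). */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long blocksize = 4096;	/* fs block size */
	unsigned long long bytes = 100;		/* inode_get_bytes() */
	unsigned long long delalloc = 0;	/* delalloc_bytes */

	/* before the fix: 100 >> 9 == 0 blocks reported */
	printf("old st_blocks: %llu\n", (bytes + delalloc) >> 9);

	/* after the fix: round each component up to a full block,
	 * then convert to 512-byte sectors: 4096 >> 9 == 8 */
	printf("new st_blocks: %llu\n",
	       (ALIGN(bytes, blocksize) + ALIGN(delalloc, blocksize)) >> 9);
	return 0;
}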
Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e16215f480d0..8ad26b135a1c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6794,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } -- cgit v1.2.2 From 5bb1468238e20b15921909e9f9601e945f03bac7 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: prefix resize related printks with btrfs: For the user it is confusing to find something like: [10197.627710] new size for /dev/mapper/vg0-usr_share is 3221225472 in kernel log, because it doesn't point directly to btrfs. This patch prefixes those messages with "btrfs:" like other btrfs related printks. Signed-off-by: Arnd Hannemann Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bda53fcd1a09..a90e749ed6d2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { -- cgit v1.2.2 From 291c7d2f577428f896daa5002e784959328a80aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2011 13:52:14 -0500 Subject: Btrfs: wait on caching if we're loading the free space cache We've been hitting panics when running xfstest 13 in a loop for long periods of time. And actually this problem has always existed so we've been hitting these things randomly for a while. Basically what happens is we get a thread coming into the allocator and reading the space cache off of disk and adding the entries to the free space cache as we go. Then we get another thread that comes in and tries to allocate from that block group. Since block_group->cached != BTRFS_CACHE_NO it goes ahead and tries to do the allocation. We do this because if we're doing the old slow way of caching we don't want to hold people up and wait for everything to finish. 
The problem with this is we could end up discarding the space cache at some arbitrary point in the future, which means we could very well end up allocating space that is either bad, or when the real caching happens it could end up thinking the space isn't in use when it really is and cause all sorts of other problems. The solution is to add a new flag to indicate we are loading the free space cache from disk, and always try to cache the block group if cache->cached != BTRFS_CACHE_FINISHED. That way if we are loading the space cache anybody else who tries to allocate from the block group will have to wait until it's finished to make sure it completes successfully. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 119 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 81 insertions(+), 41 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1cb3c052484..04a5dfcee5a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f47b3e2010e..5d86877f10e1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. 
+ */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -5177,13 +5218,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5205,7 +5248,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5215,10 +5257,7 
@@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; -- cgit v1.2.2 From f7d61dcd6873c49bcc42be2caa2af1c2511aa915 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2011 09:31:24 -0500 Subject: Btrfs: clear pages dirty for io and set them extent mapped When doing the io_ctl helpers to clean up the free space cache stuff I stopped using our normal prepare_pages stuff, which means I of course forgot to do things like set the pages extent mapped, which will cause us all sorts of wonderful problems. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8c32434da2cb..aedacdbf77e2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, } } + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + return 0; } -- cgit v1.2.2 From 4d479cf010d56ec9c54f3099992d039918f1296b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Nov 2011 11:34:31 -0500 Subject: Btrfs: sectorsize align offsets in fiemap We've been hitting BUG()'s in btrfs_cont_expand and btrfs_fallocate and anywhere else that calls btrfs_get_extent while running xfstests 13 in a loop. This is because fiemap is calling btrfs_get_extent with non-sectorsize aligned offsets, which will end up adding mappings that are not sectorsize aligned, which will cause problems in some cases for subsequent calls to btrfs_get_extent for similar areas that are sectorsize aligned. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't hit the problem that I could previously hit in at most 20 minutes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a877b8b212eb..9472d3de5e52 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; -- cgit v1.2.2 From 24a70313969fc3fc440216b40babdb42564acff3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 21 Nov 2011 09:39:11 -0500 Subject: Btrfs: remove free-space-cache.c WARN during log replay The log replay code only partially loads block groups, since the block group caching code is able to detect and deal with extents the logging code has pinned down. While the logging code is pinning down block groups, there is a bogus WARN_ON we're hitting if the code wasn't able to find an extent in the cache. 
This commit removes the warning because it can happen any time there isn't a valid free space cache for that block group. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aedacdbf77e2..6e5b7e463698 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1849,7 +1849,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } -- cgit v1.2.2 From 26bdef541d26fd6a5ddffdf8949ace22f94f809f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Nov 2011 11:28:01 +0300 Subject: btrfs scrub: handle -ENOMEM from init_ipath() init_ipath() can return an ERR_PTR(-ENOMEM). Signed-off-by: Dan Carpenter --- fs/btrfs/scrub.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fab420db5121..c27bcb67f330 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -256,6 +256,11 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) btrfs_release_path(swarn->path); ipath = init_ipath(4096, local_root, swarn->path); + if (IS_ERR(ipath)) { + ret = PTR_ERR(ipath); + ipath = NULL; + goto err; + } ret = paths_from_inode(inum, ipath); if (ret < 0) -- cgit v1.2.2 From aa38a711a893accf5b5192f3d705a120deaa81e0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 18 Nov 2011 17:43:00 +0800 Subject: Btrfs: fix deadlock on metadata reservation when evicting an inode When I ran the xfstests, I found the test tasks were blocked on meta-data reservation. By debugging, I found the reason for this bug: start transaction | v reserve meta-data space | v flush delay allocation -> iput inode -> evict inode ^ | | v wait for delay allocation flush <- reserve meta-data space Besides that, the flush during eviction blocks the thread that is reclaiming memory, which can easily trigger OOM. Fix this bug by skipping the flush step when evicting an inode. 
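The shape of the fix is a common kernel pattern: fold the existing function into a static worker that takes a flush flag, then export two thin wrappers so the eviction path can opt out of flushing. A condensed, compilable sketch of that structure (simplified types; the real signatures are in the diff below):

#include <stdio.h>

typedef unsigned long long u64;

struct block_rsv { u64 reserved; };

/* Worker: flush == 1 may kick delalloc writeback to find space,
 * which is exactly what must not happen while evicting an inode,
 * since eviction can itself be reached from that writeback. */
static int __block_rsv_refill(struct block_rsv *rsv, u64 min, int flush)
{
	(void)flush;		/* placeholder for the flushing logic */
	rsv->reserved = min;
	return 0;
}

static int block_rsv_refill(struct block_rsv *rsv, u64 min)
{
	return __block_rsv_refill(rsv, min, 1);
}

/* Used on the eviction path to break the cycle. */
static int block_rsv_refill_noflush(struct block_rsv *rsv, u64 min)
{
	return __block_rsv_refill(rsv, min, 0);
}

int main(void)
{
	struct block_rsv rsv = { 0 };

	block_rsv_refill(&rsv, 4096);
	block_rsv_refill_noflush(&rsv, 4096);
	printf("reserved %llu bytes\n", rsv.reserved);
	return 0;
}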
Signed-off-by: Miao Xie --- fs/btrfs/ctree.h | 3 +++ fs/btrfs/extent-tree.c | 22 ++++++++++++++++++---- fs/btrfs/inode.c | 2 +- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 04a5dfcee5a1..50634abef9b4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2369,6 +2369,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved); +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved); int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5d86877f10e1..b7e5f6898d07 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3887,9 +3887,9 @@ int btrfs_block_rsv_check(struct btrfs_root *root, return ret; } -int btrfs_block_rsv_refill(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 min_reserved) +static inline int __btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int flush) { u64 num_bytes = 0; int ret = -ENOSPC; @@ -3908,7 +3908,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, if (!ret) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3917,6 +3917,20 @@ int btrfs_block_rsv_refill(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_refill(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1); +} + +int btrfs_block_rsv_refill_noflush(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved) +{ + return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0); +} + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8ad26b135a1c..c5ccec23984c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,7 +3490,7 @@ void btrfs_evict_inode(struct inode *inode) * doing the truncate. */ while (1) { - ret = btrfs_block_rsv_refill(root, rsv, min_size); + ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size); /* * Try and steal from the global reserve since we will -- cgit v1.2.2 From ece7d20e8be6730fbb29f4550de6b19b1a3a9387 Mon Sep 17 00:00:00 2001 From: Mike Fleetwood Date: Fri, 18 Nov 2011 18:55:01 +0000 Subject: Btrfs: Don't error on resizing FS to same size It seems overly harsh to fail a resize of a btrfs file system to the same size when a shrink or grow would succeed. User app GParted trips over this error. Allow it by bypassing the shrink or grow operation. 
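The resulting control flow is a plain three-way comparison: grow when the new size is larger, shrink when it is smaller, and fall through as a successful no-op when the sizes match (previously the equal case took the shrink branch and could fail). A sketch with illustrative stand-ins for btrfs_grow_device()/btrfs_shrink_device():

#include <stdio.h>

typedef unsigned long long u64;

static int resize_device(u64 old_size, u64 new_size)
{
	if (new_size > old_size)
		printf("grow to %llu\n", new_size);
	else if (new_size < old_size)
		printf("shrink to %llu\n", new_size);
	else
		printf("same size, nothing to do\n");
	return 0;
}

int main(void)
{
	resize_device(100, 200);	/* grow */
	resize_device(200, 100);	/* shrink */
	resize_device(100, 100);	/* used to take the shrink path */
	return 0;
}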
Signed-off-by: Mike Fleetwood --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a90e749ed6d2..72d461656f60 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1278,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, } ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); - } else { + } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } -- cgit v1.2.2 From b772a86ea6d932ac29d5e50e67c977653c832f8a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 28 Nov 2011 16:43:00 +0800 Subject: Btrfs: fix oops when calling statfs on readonly device To reproduce this bug: # dd if=/dev/zero of=img bs=1M count=256 # mkfs.btrfs img # losetup -r /dev/loop1 img # mount /dev/loop1 /mnt OOPS!! It triggered BUG_ON(!nr_devices) in btrfs_calc_avail_data_space(). To fix this, instead of counting only writable devices, we check all open devices: # df -h /dev/loop1 Filesystem Size Used Avail Use% Mounted on /dev/loop1 250M 28K 238M 1% /mnt Signed-off-by: Li Zefan --- fs/btrfs/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8bd9d6d0e07a..1a3ce9e0b495 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1083,7 +1083,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) int i = 0, nr_devices; int ret; - nr_devices = fs_info->fs_devices->rw_devices; + nr_devices = fs_info->fs_devices->open_devices; BUG_ON(!nr_devices); devices_info = kmalloc(sizeof(*devices_info) * nr_devices, @@ -1105,8 +1105,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) else min_stripe_size = BTRFS_STRIPE_LEN; - list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { - if (!device->in_fs_metadata) + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->in_fs_metadata || !device->bdev) continue; avail_space = device->total_bytes - device->bytes_used; -- cgit v1.2.2 From f2d0f6765d6332f9be732965a0c6f3b8a55082b4 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Mon, 28 Nov 2011 12:04:43 -0200 Subject: Btrfs: initialize new bitmaps' list We're failing to create clusters with bitmaps because setup_cluster_no_bitmap checks that the list is empty before inserting the bitmap entry in the list for setup_cluster_bitmap, but the list field is only initialized when it is restored from the on-disk free space cache, or when it is written out to disk. Besides a potential race condition due to the multiple use of the list field, filesystem performance severely degrades over time: as we use up all non-bitmap free extents, the try-to-set-up-cluster dance is done at every metadata block allocation. For every block group, we fail to set up a cluster, and after failing on them all up to twice, we fall back to the much slower unclustered allocation. To make matters worse, before the unclustered allocation, we try to create new block groups until we reach the 1% threshold, which introduces additional bitmaps and thus block groups that we'll iterate over at each metadata block request. 
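The bug class deserves a standalone illustration: list_empty() on a list_head that was never initialized reads whatever the allocator left behind, so the emptiness check that guards the list_add() can go either way for a freshly created bitmap. A freestanding sketch (a minimal list_head with the same semantics as the kernel's):

#include <stdio.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *list)
{
	list->next = list;
	list->prev = list;
}

/* Same test the kernel uses: a list is empty when it points at itself. */
static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

struct free_space {
	unsigned long long offset;
	struct list_head list;
};

int main(void)
{
	struct free_space info;

	/* kzalloc()-style zeroed memory is NOT an empty list... */
	memset(&info, 0, sizeof(info));
	printf("zeroed: list_empty = %d\n", list_empty(&info.list));

	/* ...so new bitmaps were silently skipped until the one-line
	 * INIT_LIST_HEAD() fix in the patch below. */
	INIT_LIST_HEAD(&info.list);
	printf("inited: list_empty = %d\n", list_empty(&info.list));
	return 0;
}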
--- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6e5b7e463698..ff179b1e7423 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1470,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; -- cgit v1.2.2 From b78d09bceb524ee6481c21b77bda22d766b10e6a Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: reset cluster's max_size when creating bitmap The field that indicates the size of the largest contiguous chunk of free space in the cluster is not initialized when setting up bitmaps, it's only increased when we find a larger contiguous chunk. We end up retaining a larger value than appropriate for highly-fragmented clusters, which may cause pointless searches for large contiguous groups, and even cause clusters that do not meet the density requirements to be set up. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/btrfs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ff179b1e7423..ec23d43d0c35 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2320,6 +2320,7 @@ again: if (!found) { start = i; + cluster->max_size = 0; found = true; } -- cgit v1.2.2 From 1b22bad779be7fe07242be04749ec969164528b8 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: start search for new cluster at the beginning Instead of starting at zero (offset is always zero), request a cluster starting at search_start, that denotes the beginning of the current block group. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b7e5f6898d07..97c12067a4b0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5301,10 +5301,8 @@ alloc: spin_lock(&last_ptr->refill_lock); if (last_ptr->block_group && (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) { - offset = 0; + !block_group_bits(last_ptr->block_group, data))) goto refill_cluster; - } offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); @@ -5355,7 +5353,7 @@ refill_cluster: /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, - offset, num_bytes, + search_start, num_bytes, empty_cluster + empty_size); if (ret == 0) { /* -- cgit v1.2.2 From 425d83156ca27f74e2cc3f370138038c3c8947f8 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: skip block groups without enough space for a cluster We test whether a block group has enough free space to hold the requested block, but when we're doing clustered allocation, we can save some cycles by testing whether it has enough room for the cluster upfront, otherwise we end up attempting to set up a cluster and failing. Only in the NO_EMPTY_SIZE loop do we attempt an unclustered allocation, and by then we'll have zeroed the cluster size, so this patch won't stop us from using the block group as a last resort. 
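The change is a one-line strengthening of a pure arithmetic predicate; a sketch with made-up numbers shows the case it catches: a group with 1 MiB free passes the old per-allocation test but can never host a 2 MiB cluster, so trying to set one up there is wasted work.

#include <stdio.h>

typedef unsigned long long u64;

int main(void)
{
	u64 free_space = 1 * 1024 * 1024;	/* left in this block group */
	u64 num_bytes = 16 * 1024;		/* the requested allocation */
	u64 empty_size = 0;
	u64 empty_cluster = 2 * 1024 * 1024;	/* cluster preallocation */

	/* old test: the group looks usable... */
	printf("old skip? %d\n", free_space < num_bytes + empty_size);

	/* new test: ...but it cannot fit the cluster, so skip it now
	 * instead of failing cluster setup later. */
	printf("new skip? %d\n",
	       free_space < num_bytes + empty_cluster + empty_size);
	return 0;
}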
Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 97c12067a4b0..71c8e7049d0c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5278,7 +5278,7 @@ alloc: spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < - num_bytes + empty_size) { + num_bytes + empty_cluster + empty_size) { spin_unlock(&block_group->free_space_ctl->tree_lock); goto loop; } -- cgit v1.2.2 From be064d113906f04ea13088a8260e1e68ae0a4050 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 30 Nov 2011 13:43:00 -0500 Subject: Btrfs: skip allocation attempt from empty cluster If we don't have a cluster, don't bother trying to allocate from it, jumping right away to the attempt to allocate a new cluster. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71c8e7049d0c..813c6bb96c9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5299,9 +5299,9 @@ alloc: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (last_ptr->block_group && - (last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data))) + if (!last_ptr->block_group || + last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data)) goto refill_cluster; offset = btrfs_alloc_from_cluster(block_group, last_ptr, -- cgit v1.2.2 From f4a8e6563ea5366f563cb741a27fe90c5fa7f0fc Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Thu, 1 Dec 2011 09:30:36 -0500 Subject: Btrfs: fix meta data raid-repair merge problem Commit 4a54c8c16 introduced raid-repair, killing the individual readpage_io_failed_hook entries from inode.c and disk-io.c. Commit 4bb31e92 introduced new readahead code, adding a readpage_io_failed_hook to disk-io.c. The raid-repair commit had logic to disable raid-repair, if readpage_io_failed_hook is set. Thus, the readahead commit effectively disabled raid-repair for meta data. This commit changes the logic to always attempt raid-repair when needed and call the readpage_io_failed_hook in case raid-repair fails. This is much more straight forward and should have been like that from the beginning. Signed-off-by: Jan Schmidt Reported-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9472d3de5e52..be1bf627a14b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2287,14 +2287,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (!uptodate) { int failed_mirror; failed_mirror = (int)(unsigned long)bio->bi_bdev; - if (tree->ops && tree->ops->readpage_io_failed_hook) - ret = tree->ops->readpage_io_failed_hook( - bio, page, start, end, - failed_mirror, state); - else - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + /* + * The generic bio_readpage_error handles errors the + * following way: If possible, new read requests are + * created and submitted and will end up in + * end_bio_extent_readpage as well (if we're lucky, not + * in the !uptodate case). 
In that case it returns 0 and + * we just go on with the next page in our bio. If it + * can't handle the error it will return -EIO and we + * remain responsible for that page. + */ + ret = bio_readpage_error(bio, page, start, end, + failed_mirror, NULL); if (ret == 0) { +error_handled: uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); if (err) @@ -2302,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err) uncache_state(&cached); continue; } + if (tree->ops && tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook( + bio, page, start, end, + failed_mirror, state); + if (ret == 0) + goto error_handled; + } } if (uptodate) { -- cgit v1.2.2 From 062c05c46bd4358aad7a0e0cb5ffeb98ab935286 Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 19:50:42 -0500 Subject: Btrfs: try to allocate from cluster even at LOOP_NO_EMPTY_SIZE If we reach LOOP_NO_EMPTY_SIZE, we won't even try to use a cluster that others might have set up. Odds are that there won't be one, but if someone else succeeded in setting it up, we might as well use it, even if we don't try to set up a cluster again. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 813c6bb96c9a..db0b23b14f20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5285,15 +5285,10 @@ alloc: spin_unlock(&block_group->free_space_ctl->tree_lock); /* - * Ok we want to try and use the cluster allocator, so lets look - * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will - * have tried the cluster allocator plenty of times at this - * point and not have found anything, so we are likely way too - * fragmented for the clustering stuff to find anything, so lets - * just skip it and let the allocator find whatever block it can - * find + * Ok we want to try and use the cluster allocator, so + * lets look there */ - if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) { + if (last_ptr) { /* * the refill lock keeps out other * people trying to start a new cluster @@ -5342,6 +5337,20 @@ alloc: } spin_unlock(&last_ptr->lock); refill_cluster: + /* If we are on LOOP_NO_EMPTY_SIZE, we can't + * set up a new clusters, so lets just skip it + * and let the allocator find whatever block + * it can find. If we reach this point, we + * will have tried the cluster allocator + * plenty of times and not have found + * anything, so we are likely way too + * fragmented for the clustering stuff to find + * anything. */ + if (loop >= LOOP_NO_EMPTY_SIZE) { + spin_unlock(&last_ptr->refill_lock); + goto unclustered_alloc; + } + /* * this cluster didn't work out, free it and * start over @@ -5389,6 +5398,7 @@ refill_cluster: goto loop; } +unclustered_alloc: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); /* -- cgit v1.2.2 From 274bd4fb3ed6b72c1d77ef8850511f09fc6b8e4d Mon Sep 17 00:00:00 2001 From: Alexandre Oliva Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: try cluster but don't advance in search list When we find an existing cluster, we switch to its block group as the current block group, possibly skipping multiple blocks in the process. Furthermore, under heavy contention, multiple threads may fail to allocate from a cluster and then release just-created clusters just to proceed to create new ones in a different block group. 
This patch tries to allocate from an existing cluster regardless of its block group, and doesn't switch to that group, instead proceeding to try to allocate a cluster from the group it was iterating before the attempt. Signed-off-by: Alexandre Oliva Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 74 +++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 43 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index db0b23b14f20..05e1386b8bec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5106,11 +5106,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; + struct btrfs_block_group_cache *used_block_group; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; int done_chunk_alloc = 0; struct btrfs_space_info *space_info; - int last_ptr_loop = 0; int loop = 0; int index = 0; int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? @@ -5172,6 +5172,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, ideal_cache: block_group = btrfs_lookup_block_group(root->fs_info, search_start); + used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -5209,6 +5210,7 @@ search: u64 offset; int cached; + used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -5294,49 +5296,33 @@ alloc: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); - if (!last_ptr->block_group || - last_ptr->block_group->ro || - !block_group_bits(last_ptr->block_group, data)) + used_block_group = last_ptr->block_group; + if (used_block_group != block_group && + (!used_block_group || + used_block_group->ro || + !block_group_bits(used_block_group, data))) { + used_block_group = block_group; goto refill_cluster; + } + + if (used_block_group != block_group) + btrfs_get_block_group(used_block_group); - offset = btrfs_alloc_from_cluster(block_group, last_ptr, - num_bytes, search_start); + offset = btrfs_alloc_from_cluster(used_block_group, + last_ptr, num_bytes, used_block_group->key.objectid); if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); goto checks; } - spin_lock(&last_ptr->lock); - /* - * whoops, this cluster doesn't actually point to - * this block group. 
Get a ref on the block - * group is does point to and try again - */ - if (!last_ptr_loop && last_ptr->block_group && - last_ptr->block_group != block_group && - index <= - get_block_group_index(last_ptr->block_group)) { - - btrfs_put_block_group(block_group); - block_group = last_ptr->block_group; - btrfs_get_block_group(block_group); - spin_unlock(&last_ptr->lock); - spin_unlock(&last_ptr->refill_lock); - - last_ptr_loop = 1; - search_start = block_group->key.objectid; - /* - * we know this block group is properly - * in the list because - * btrfs_remove_block_group, drops the - * cluster before it removes the block - * group from the list - */ - goto have_block_group; + WARN_ON(last_ptr->block_group != used_block_group); + if (used_block_group != block_group) { + btrfs_put_block_group(used_block_group); + used_block_group = block_group; } - spin_unlock(&last_ptr->lock); refill_cluster: + BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -5357,8 +5343,6 @@ refill_cluster: */ btrfs_return_cluster_to_free_space(NULL, last_ptr); - last_ptr_loop = 0; - /* allocate a cluster in this block group */ ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, @@ -5425,14 +5409,14 @@ checks: search_start = stripe_align(root, offset); /* move on to the next group */ if (search_start + num_bytes >= search_end) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } /* move on to the next group */ if (search_start + num_bytes > - block_group->key.objectid + block_group->key.offset) { - btrfs_add_free_space(block_group, offset, num_bytes); + used_block_group->key.objectid + used_block_group->key.offset) { + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5440,14 +5424,14 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(block_group, num_bytes, + ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(block_group, offset, num_bytes); + btrfs_add_free_space(used_block_group, offset, num_bytes); goto loop; } @@ -5456,15 +5440,19 @@ checks: ins->offset = num_bytes; if (offset < search_start) - btrfs_add_free_space(block_group, offset, + btrfs_add_free_space(used_block_group, offset, search_start - offset); BUG_ON(offset > search_start); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); + if (used_block_group != block_group) + btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.2 From a5d16333612718569ffd26064270e535cb9c3928 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: check if the to-be-added device is writable If we call ioctl(BTRFS_IOC_ADD_DEV) directly, we'll succeed in adding a readonly device to a btrfs filesystem, and btrfs will write to that device, emitting kernel errors: [ 3109.833692] lost page write due to I/O error on loop2 [ 3109.833720] lost page write due to I/O error on loop2 ... 
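The user-space analogue of this fix: ask for write access when opening the device, so a read-only device is rejected up front with a clear error instead of producing lost writes later. A hedged sketch (the loop device path matches the reproducer above; O_EXCL on a block device requests exclusive access, roughly what FMODE_EXCL does in the kernel):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* A device set up with `losetup -r` is read-only; opening it
	 * O_RDWR fails immediately (typically with EACCES), which is
	 * the behavior adding FMODE_WRITE buys the kernel side. */
	int fd = open("/dev/loop2", O_RDWR | O_EXCL);

	if (fd < 0) {
		perror("open /dev/loop2 for writing");
		return 1;
	}
	close(fd);
	return 0;
}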
Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c37433d3cd82..0a8c8f8304b1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1611,7 +1611,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); -- cgit v1.2.2 From 1cf4ffdb3289624a6462c94f2ce05545b32ef736 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Wed, 7 Dec 2011 20:08:40 -0500 Subject: Btrfs: drop spin lock when memory alloc fails Drop spin lock in convert_extent_bit() when memory alloc fails, otherwise we will deadlock. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs/btrfs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index be1bf627a14b..49f3c9dc09f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -935,8 +935,10 @@ again: node = tree_search(tree, start); if (!node) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); @@ -992,8 +994,10 @@ hit_next: */ if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, start); BUG_ON(err == -EEXIST); prealloc = NULL; @@ -1024,8 +1028,10 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } /* * Avoid to free 'prealloc' if it can be merged with @@ -1051,8 +1057,10 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - if (!prealloc) - return -ENOMEM; + if (!prealloc) { + err = -ENOMEM; + goto out; + } err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); -- cgit v1.2.2
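The pattern behind this last fix generalizes well beyond btrfs: once a spinlock is taken, every early return in the function has to funnel through the unlock, so allocation failures set an error code and jump to the common exit instead of returning directly. A compilable sketch of that shape (a pthread mutex standing in for the kernel spinlock, illustrative names throughout):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int convert_bits(size_t n)
{
	void *prealloc = NULL;
	int err = 0;

	pthread_mutex_lock(&tree_lock);

	prealloc = malloc(n);
	if (!prealloc) {
		/* was: "return -ENOMEM" with tree_lock still held, so
		 * the next caller deadlocks; now we unwind through out. */
		err = -ENOMEM;
		goto out;
	}

	/* ... the actual work under the lock goes here ... */

out:
	pthread_mutex_unlock(&tree_lock);
	free(prealloc);
	return err;
}

int main(void)
{
	printf("convert_bits: %d\n", convert_bits(64));
	return 0;
}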