From 292fd7fc39aa06668f3a8db546714e727120cb3e Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 30 Oct 2012 17:16:16 +0000 Subject: Btrfs: don't allow degraded mount if too many devices are missing The current behavior is to allow mounting or remounting a filesystem writeable in degraded mode if at least one writeable device is present. The next failed write access to a missing device which is above the tolerance of the configured level of redundancy results in a read-only enforcement. Even without this, the next time barrier_all_devices() is called and more devices are missing than tolerable, the switch to read-only mode takes place. In order to behave predictably and to provide proper feedback to the user at mount time, this patch compares the number of missing devices with the number of devices that are tolerated to be missing according to the configured RAID level. If more devices are missing than tolerated, e.g. if two devices are missing in case of RAID1, only a read-only mount and remount is allowed. 
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 915ac14c2064..acd2df85bed5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1226,6 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + if (fs_info->fs_devices->missing_devices > + fs_info->num_tolerated_disk_barrier_failures && + !(*flags & MS_RDONLY)) { + printk(KERN_WARNING + "Btrfs: too many missing devices, writeable remount is not allowed\n"); + ret = -EACCES; + goto restore; + } + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; -- cgit v1.2.2 From aa1b8cd409f05e1489ec77ff219eff6ed4b801b8 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:03:39 +0100 Subject: Btrfs: pass fs_info instead of root A small number of functions that are used in a device replace procedure when the operation is resumed at mount time are unable to pass the same root pointer that would be used in the regular (ioctl) context. And since the root pointer is not required, only the fs_info is, the root pointer argument is replaced with the fs_info pointer argument. 
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index acd2df85bed5..a1a6c296ddcd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - __btrfs_scrub_cancel(fs_info); + btrfs_scrub_cancel(fs_info); // WARN_ON(1); } } -- cgit v1.2.2 From 1acd6831d98779c88cd57f0a5826d6df0b09f3fa Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 17:11:06 +0100 Subject: Btrfs: avoid risk of a deadlock in btrfs_handle_error Remove the attempt to cancel a running scrub or device replace operation in btrfs_handle_error() because it adds the risk of a deadlock. The only penalty of not canceling the operation is that some I/O remains active until the procedure completes. This is basically the same thing that happens to other tasks that are running in user mode context, they are not affected or stopped in btrfs_handle_error(), these tasks just need to handle write errors correctly. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a1a6c296ddcd..ef2415896b06 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -116,7 +116,16 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { sb->s_flags |= MS_RDONLY; printk(KERN_INFO "btrfs is forced readonly\n"); - btrfs_scrub_cancel(fs_info); + /* + * Note that a running device replace operation is not + * canceled here although there is no way to update + * the progress. 
It would add the risk of a deadlock, + * therefore the canceling is ommited. The only penalty + * is that some I/O remains active until the procedure + * completes. The next time when the filesystem is + * mounted writeable again, the device replace + * operation continues. + */ // WARN_ON(1); } } -- cgit v1.2.2 From 63a212abc2315972b245f93cb11ae3acf3c0b513 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 5 Nov 2012 18:29:28 +0100 Subject: Btrfs: disallow some operations on the device replace target device This patch adds some code to disallow operations on the device that is used as the target for the device replace operation. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef2415896b06..837ad2d27853 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1354,7 +1354,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) min_stripe_size = BTRFS_STRIPE_LEN; list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->in_fs_metadata || !device->bdev) + if (!device->in_fs_metadata || !device->bdev || + device->is_tgtdev_for_dev_replace) continue; avail_space = device->total_bytes - device->bytes_used; -- cgit v1.2.2 From ff023aac31198e88507d626825379b28ea481d4d Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 11:43:11 +0100 Subject: Btrfs: add code to scrub to copy read data to another disk The device replace procedure makes use of the scrub code. The scrub code is the most efficient code to read the allocated data of a disk, i.e. it reads sequentially in order to avoid disk head movements, it skips unallocated blocks, it uses read ahead mechanisms, and it contains all the code to detect and repair defects. This commit adds code to scrub to allow the scrub code to copy read data to another disk. 
One goal is to be able to perform as fast as possible. Therefore the write requests are collected until huge bios are built, and the write process is decoupled from the read process with some kind of flow control, of course, in order to limit the allocated memory. The best performance on spinning disks could be reached when the head movements are avoided as much as possible. Therefore a single worker is used to interface the read process with the write process. The regular scrub operation works as fast as before, it is not negatively influenced and actually it is more or less unchanged. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 837ad2d27853..ad4380684b9b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1195,7 +1195,8 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size); btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size); btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size); - btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size); + btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers, + new_pool_size); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) -- cgit v1.2.2 From 8dabb7420f014ab0f9f04afae8ae046c0f48b270 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Tue, 6 Nov 2012 13:15:27 +0100 Subject: Btrfs: change core code of btrfs to support the device replace operations This commit contains all the essential changes to the core code of Btrfs for support of the device replace procedure. 
Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/super.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ad4380684b9b..def4f24b58df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "export.h" #include "compression.h" #include "rcu-string.h" +#include "dev-replace.h" #define CREATE_TRACE_POINTS #include @@ -1225,8 +1226,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) return 0; if (*flags & MS_RDONLY) { + /* + * this also happens on 'umount -rf' or on shutdown, when + * the filesystem is busy. + */ sb->s_flags |= MS_RDONLY; + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + ret = btrfs_commit_super(root); if (ret) goto restore; @@ -1263,6 +1271,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + ret = btrfs_resume_dev_replace_async(fs_info); + if (ret) { + pr_warn("btrfs: failed to resume dev_replace\n"); + goto restore; + } sb->s_flags &= ~MS_RDONLY; } -- cgit v1.2.2 From 9247f3170b2c3d648707c93bbebcd763fac17c06 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 26 Nov 2012 09:24:43 +0000 Subject: Btrfs: use slabs for auto defrag allocation The auto defrag allocation is in the fast path of the IO, so use slabs to improve the speed of the allocation. And besides that, it can do check for leaked objects when the module is removed. 
Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/super.c') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index def4f24b58df..99545df1b86c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1680,10 +1680,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_ordered_data; - err = btrfs_interface_init(); + err = btrfs_auto_defrag_init(); if (err) goto free_delayed_inode; + err = btrfs_interface_init(); + if (err) + goto free_auto_defrag; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -1695,6 +1699,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_auto_defrag: + btrfs_auto_defrag_exit(); free_delayed_inode: btrfs_delayed_inode_exit(); free_ordered_data: @@ -1714,6 +1720,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { btrfs_destroy_cachep(); + btrfs_auto_defrag_exit(); btrfs_delayed_inode_exit(); ordered_data_exit(); extent_map_exit(); -- cgit v1.2.2