aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Stefan Behrens <sbehrens@giantdisaster.de> 2012-08-01 12:56:49 -0400
committer: Chris Mason <chris.mason@fusionio.com> 2012-10-09 09:20:19 -0400
commit: 5af3e8cce8b7ba0a2819e18c9146c8c0b452d479 (patch)
tree: 15c2b4859ea041c09f027ac44de506ac0ffcafa4
parent: 62856a9b73860cffe2a3d91b069393b88c219aa6 (diff)
Btrfs: make filesystem read-only when submitting barrier fails
So far the return code of barrier_all_devices() is ignored, which means that errors are ignored. The result can be a corrupt, inconsistent filesystem.

This commit adds code to evaluate the return code of barrier_all_devices(). The normal btrfs_error() mechanism is used to switch the filesystem into read-only mode when errors are detected.

In order to decide whether barrier_all_devices() should return error or success, the number of disks that are allowed to fail the barrier submission is calculated. This calculation accounts for the worst RAID level of metadata, system and data. If single, dup or RAID0 is in use, a single disk error is already considered to be fatal. Otherwise a single disk error is tolerated.

The number of disks that are allowed to fail the barrier operation is calculated when the filesystem gets mounted, when a balance operation is started and finished, and when devices are added or removed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
-rw-r--r-- fs/btrfs/ctree.h 5
-rw-r--r-- fs/btrfs/disk-io.c 109
-rw-r--r-- fs/btrfs/disk-io.h 2
-rw-r--r-- fs/btrfs/ioctl.c 8
-rw-r--r-- fs/btrfs/tree-log.c 7
-rw-r--r-- fs/btrfs/volumes.c 30
6 files changed, 142 insertions, 19 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 50dcd0fbae11..1630be831210 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1468,6 +1468,8 @@ struct btrfs_fs_info {
1468 1468
1469 /* next backup root to be overwritten */ 1469 /* next backup root to be overwritten */
1470 int backup_root_index; 1470 int backup_root_index;
1471
1472 int num_tolerated_disk_barrier_failures;
1471}; 1473};
1472 1474
1473/* 1475/*
@@ -3361,6 +3363,9 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
3361int btrfs_defrag_file(struct inode *inode, struct file *file, 3363int btrfs_defrag_file(struct inode *inode, struct file *file,
3362 struct btrfs_ioctl_defrag_range_args *range, 3364 struct btrfs_ioctl_defrag_range_args *range,
3363 u64 newer_than, unsigned long max_pages); 3365 u64 newer_than, unsigned long max_pages);
3366void btrfs_get_block_group_info(struct list_head *groups_list,
3367 struct btrfs_ioctl_space_info *space);
3368
3364/* file.c */ 3369/* file.c */
3365int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 3370int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
3366 struct inode *inode); 3371 struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index c69995556f61..835523687707 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2505,6 +2505,8 @@ retry_root_backup:
2505 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2505 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
2506 goto fail_block_groups; 2506 goto fail_block_groups;
2507 } 2507 }
2508 fs_info->num_tolerated_disk_barrier_failures =
2509 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2508 2510
2509 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 2511 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
2510 "btrfs-cleaner"); 2512 "btrfs-cleaner");
@@ -2888,12 +2890,10 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2888 printk_in_rcu("btrfs: disabling barriers on dev %s\n", 2890 printk_in_rcu("btrfs: disabling barriers on dev %s\n",
2889 rcu_str_deref(device->name)); 2891 rcu_str_deref(device->name));
2890 device->nobarriers = 1; 2892 device->nobarriers = 1;
2891 } 2893 } else if (!bio_flagged(bio, BIO_UPTODATE)) {
2892 if (!bio_flagged(bio, BIO_UPTODATE)) {
2893 ret = -EIO; 2894 ret = -EIO;
2894 if (!bio_flagged(bio, BIO_EOPNOTSUPP)) 2895 btrfs_dev_stat_inc_and_print(device,
2895 btrfs_dev_stat_inc_and_print(device, 2896 BTRFS_DEV_STAT_FLUSH_ERRS);
2896 BTRFS_DEV_STAT_FLUSH_ERRS);
2897 } 2897 }
2898 2898
2899 /* drop the reference from the wait == 0 run */ 2899 /* drop the reference from the wait == 0 run */
@@ -2932,14 +2932,15 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2932{ 2932{
2933 struct list_head *head; 2933 struct list_head *head;
2934 struct btrfs_device *dev; 2934 struct btrfs_device *dev;
2935 int errors = 0; 2935 int errors_send = 0;
2936 int errors_wait = 0;
2936 int ret; 2937 int ret;
2937 2938
2938 /* send down all the barriers */ 2939 /* send down all the barriers */
2939 head = &info->fs_devices->devices; 2940 head = &info->fs_devices->devices;
2940 list_for_each_entry_rcu(dev, head, dev_list) { 2941 list_for_each_entry_rcu(dev, head, dev_list) {
2941 if (!dev->bdev) { 2942 if (!dev->bdev) {
2942 errors++; 2943 errors_send++;
2943 continue; 2944 continue;
2944 } 2945 }
2945 if (!dev->in_fs_metadata || !dev->writeable) 2946 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2947,13 +2948,13 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2947 2948
2948 ret = write_dev_flush(dev, 0); 2949 ret = write_dev_flush(dev, 0);
2949 if (ret) 2950 if (ret)
2950 errors++; 2951 errors_send++;
2951 } 2952 }
2952 2953
2953 /* wait for all the barriers */ 2954 /* wait for all the barriers */
2954 list_for_each_entry_rcu(dev, head, dev_list) { 2955 list_for_each_entry_rcu(dev, head, dev_list) {
2955 if (!dev->bdev) { 2956 if (!dev->bdev) {
2956 errors++; 2957 errors_wait++;
2957 continue; 2958 continue;
2958 } 2959 }
2959 if (!dev->in_fs_metadata || !dev->writeable) 2960 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2961,13 +2962,87 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
2961 2962
2962 ret = write_dev_flush(dev, 1); 2963 ret = write_dev_flush(dev, 1);
2963 if (ret) 2964 if (ret)
2964 errors++; 2965 errors_wait++;
2965 } 2966 }
2966 if (errors) 2967 if (errors_send > info->num_tolerated_disk_barrier_failures ||
2968 errors_wait > info->num_tolerated_disk_barrier_failures)
2967 return -EIO; 2969 return -EIO;
2968 return 0; 2970 return 0;
2969} 2971}
2970 2972
2973int btrfs_calc_num_tolerated_disk_barrier_failures(
2974 struct btrfs_fs_info *fs_info)
2975{
2976 struct btrfs_ioctl_space_info space;
2977 struct btrfs_space_info *sinfo;
2978 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2979 BTRFS_BLOCK_GROUP_SYSTEM,
2980 BTRFS_BLOCK_GROUP_METADATA,
2981 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2982 int num_types = 4;
2983 int i;
2984 int c;
2985 int num_tolerated_disk_barrier_failures =
2986 (int)fs_info->fs_devices->num_devices;
2987
2988 for (i = 0; i < num_types; i++) {
2989 struct btrfs_space_info *tmp;
2990
2991 sinfo = NULL;
2992 rcu_read_lock();
2993 list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
2994 if (tmp->flags == types[i]) {
2995 sinfo = tmp;
2996 break;
2997 }
2998 }
2999 rcu_read_unlock();
3000
3001 if (!sinfo)
3002 continue;
3003
3004 down_read(&sinfo->groups_sem);
3005 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3006 if (!list_empty(&sinfo->block_groups[c])) {
3007 u64 flags;
3008
3009 btrfs_get_block_group_info(
3010 &sinfo->block_groups[c], &space);
3011 if (space.total_bytes == 0 ||
3012 space.used_bytes == 0)
3013 continue;
3014 flags = space.flags;
3015 /*
3016 * return
3017 * 0: if dup, single or RAID0 is configured for
3018 * any of metadata, system or data, else
3019 * 1: if RAID5 is configured, or if RAID1 or
3020 * RAID10 is configured and only two mirrors
3021 * are used, else
3022 * 2: if RAID6 is configured, else
3023 * num_mirrors - 1: if RAID1 or RAID10 is
3024 * configured and more than
3025 * 2 mirrors are used.
3026 */
3027 if (num_tolerated_disk_barrier_failures > 0 &&
3028 ((flags & (BTRFS_BLOCK_GROUP_DUP |
3029 BTRFS_BLOCK_GROUP_RAID0)) ||
3030 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3031 == 0)))
3032 num_tolerated_disk_barrier_failures = 0;
3033 else if (num_tolerated_disk_barrier_failures > 1
3034 &&
3035 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3036 BTRFS_BLOCK_GROUP_RAID10)))
3037 num_tolerated_disk_barrier_failures = 1;
3038 }
3039 }
3040 up_read(&sinfo->groups_sem);
3041 }
3042
3043 return num_tolerated_disk_barrier_failures;
3044}
3045
2971int write_all_supers(struct btrfs_root *root, int max_mirrors) 3046int write_all_supers(struct btrfs_root *root, int max_mirrors)
2972{ 3047{
2973 struct list_head *head; 3048 struct list_head *head;
@@ -2990,8 +3065,16 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2990 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 3065 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2991 head = &root->fs_info->fs_devices->devices; 3066 head = &root->fs_info->fs_devices->devices;
2992 3067
2993 if (do_barriers) 3068 if (do_barriers) {
2994 barrier_all_devices(root->fs_info); 3069 ret = barrier_all_devices(root->fs_info);
3070 if (ret) {
3071 mutex_unlock(
3072 &root->fs_info->fs_devices->device_list_mutex);
3073 btrfs_error(root->fs_info, ret,
3074 "errors while submitting device barriers.");
3075 return ret;
3076 }
3077 }
2995 3078
2996 list_for_each_entry_rcu(dev, head, dev_list) { 3079 list_for_each_entry_rcu(dev, head, dev_list) {
2997 if (!dev->bdev) { 3080 if (!dev->bdev) {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c5b00a735fef..2025a9132c16 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -95,6 +95,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
95 u64 objectid); 95 u64 objectid);
96int btree_lock_page_hook(struct page *page, void *data, 96int btree_lock_page_hook(struct page *page, void *data,
97 void (*flush_fn)(void *)); 97 void (*flush_fn)(void *));
98int btrfs_calc_num_tolerated_disk_barrier_failures(
99 struct btrfs_fs_info *fs_info);
98 100
99#ifdef CONFIG_DEBUG_LOCK_ALLOC 101#ifdef CONFIG_DEBUG_LOCK_ALLOC
100void btrfs_init_lockdep(void); 102void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d6836af6d60f..f5a2e6c4320a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2875,8 +2875,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2875 return 0; 2875 return 0;
2876} 2876}
2877 2877
2878static void get_block_group_info(struct list_head *groups_list, 2878void btrfs_get_block_group_info(struct list_head *groups_list,
2879 struct btrfs_ioctl_space_info *space) 2879 struct btrfs_ioctl_space_info *space)
2880{ 2880{
2881 struct btrfs_block_group_cache *block_group; 2881 struct btrfs_block_group_cache *block_group;
2882 2882
@@ -2984,8 +2984,8 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2984 down_read(&info->groups_sem); 2984 down_read(&info->groups_sem);
2985 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { 2985 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2986 if (!list_empty(&info->block_groups[c])) { 2986 if (!list_empty(&info->block_groups[c])) {
2987 get_block_group_info(&info->block_groups[c], 2987 btrfs_get_block_group_info(
2988 &space); 2988 &info->block_groups[c], &space);
2989 memcpy(dest, &space, sizeof(space)); 2989 memcpy(dest, &space, sizeof(space));
2990 dest++; 2990 dest++;
2991 space_args.total_spaces++; 2991 space_args.total_spaces++;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 47911fd18310..67eab2d4d8a9 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2425,9 +2425,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2425 * in and cause problems either. 2425 * in and cause problems either.
2426 */ 2426 */
2427 btrfs_scrub_pause_super(root); 2427 btrfs_scrub_pause_super(root);
2428 write_ctree_super(trans, root->fs_info->tree_root, 1); 2428 ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
2429 btrfs_scrub_continue_super(root); 2429 btrfs_scrub_continue_super(root);
2430 ret = 0; 2430 if (ret) {
2431 btrfs_abort_transaction(trans, root, ret);
2432 goto out_wake_log_root;
2433 }
2431 2434
2432 mutex_lock(&root->log_mutex); 2435 mutex_lock(&root->log_mutex);
2433 if (root->last_log_commit < log_transid) 2436 if (root->last_log_commit < log_transid)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dfe5e3a22f55..029b903a4ae3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1475,6 +1475,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1475 free_fs_devices(cur_devices); 1475 free_fs_devices(cur_devices);
1476 } 1476 }
1477 1477
1478 root->fs_info->num_tolerated_disk_barrier_failures =
1479 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1480
1478 /* 1481 /*
1479 * at this point, the device is zero sized. We want to 1482 * at this point, the device is zero sized. We want to
1480 * remove it from the devices list and zero out the old super 1483 * remove it from the devices list and zero out the old super
@@ -1799,6 +1802,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1799 btrfs_clear_space_info_full(root->fs_info); 1802 btrfs_clear_space_info_full(root->fs_info);
1800 1803
1801 unlock_chunks(root); 1804 unlock_chunks(root);
1805 root->fs_info->num_tolerated_disk_barrier_failures =
1806 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1802 ret = btrfs_commit_transaction(trans, root); 1807 ret = btrfs_commit_transaction(trans, root);
1803 1808
1804 if (seeding_dev) { 1809 if (seeding_dev) {
@@ -2809,6 +2814,26 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2809 } 2814 }
2810 } 2815 }
2811 2816
2817 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2818 int num_tolerated_disk_barrier_failures;
2819 u64 target = bctl->sys.target;
2820
2821 num_tolerated_disk_barrier_failures =
2822 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2823 if (num_tolerated_disk_barrier_failures > 0 &&
2824 (target &
2825 (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
2826 BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
2827 num_tolerated_disk_barrier_failures = 0;
2828 else if (num_tolerated_disk_barrier_failures > 1 &&
2829 (target &
2830 (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
2831 num_tolerated_disk_barrier_failures = 1;
2832
2833 fs_info->num_tolerated_disk_barrier_failures =
2834 num_tolerated_disk_barrier_failures;
2835 }
2836
2812 ret = insert_balance_item(fs_info->tree_root, bctl); 2837 ret = insert_balance_item(fs_info->tree_root, bctl);
2813 if (ret && ret != -EEXIST) 2838 if (ret && ret != -EEXIST)
2814 goto out; 2839 goto out;
@@ -2841,6 +2866,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2841 __cancel_balance(fs_info); 2866 __cancel_balance(fs_info);
2842 } 2867 }
2843 2868
2869 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
2870 fs_info->num_tolerated_disk_barrier_failures =
2871 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
2872 }
2873
2844 wake_up(&fs_info->balance_wait_q); 2874 wake_up(&fs_info->balance_wait_q);
2845 2875
2846 return ret; 2876 return ret;