aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
author: Chris Mason <chris.mason@oracle.com> 2010-12-13 14:56:23 -0500
committer: Chris Mason <chris.mason@oracle.com> 2010-12-13 20:06:52 -0500
commit: cd02dca56442e1504fd6bc5b96f7f1870162b266 (patch)
tree: 1a38d99fc581974ba6d8136c42ca81f3b1216ea3 /fs
parent: 68433b73b104bff388aac376631d32abbbd872b0 (diff)
Btrfs: account for missing devices in RAID allocation profiles
When we mount in RAID degraded mode without adding a new device to replace the failed one, we can end up using the wrong RAID flags for allocations. This results in strange combinations of block groups (raid1 in a raid10 filesystem) and in corruption when we try to allocate blocks from single-spindle chunks on drives that are actually missing. The first device has two small 4MB chunks that mkfs creates, and these are usually unused in a raid1 or raid10 setup. But with -o degraded, the allocator will fall back to these chunks because the mask of desired raid groups isn't correct. The fix here is to count the missing devices as we build up the list of devices in the system. This count is used when picking the raid level to make sure we continue using the same levels that were in place before we lost a drive. Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs')
-rw-r--r-- fs/btrfs/extent-tree.c 17
-rw-r--r-- fs/btrfs/volumes.c 20
-rw-r--r-- fs/btrfs/volumes.h 2
3 files changed, 36 insertions, 3 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 41133b064d72..4be231e0d2bd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3044,7 +3044,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3044 3044
3045u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3045u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3046{ 3046{
3047 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3047 /*
3048 * we add in the count of missing devices because we want
3049 * to make sure that any RAID levels on a degraded FS
3050 * continue to be honored.
3051 */
3052 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3053 root->fs_info->fs_devices->missing_devices;
3048 3054
3049 if (num_devices == 1) 3055 if (num_devices == 1)
3050 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3056 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -7891,7 +7897,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7891 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7897 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7892 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7898 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7893 7899
7894 num_devices = root->fs_info->fs_devices->rw_devices; 7900 /*
7901 * we add in the count of missing devices because we want
7902 * to make sure that any RAID levels on a degraded FS
7903 * continue to be honored.
7904 */
7905 num_devices = root->fs_info->fs_devices->rw_devices +
7906 root->fs_info->fs_devices->missing_devices;
7907
7895 if (num_devices == 1) { 7908 if (num_devices == 1) {
7896 stripped |= BTRFS_BLOCK_GROUP_DUP; 7909 stripped |= BTRFS_BLOCK_GROUP_DUP;
7897 stripped = flags & ~stripped; 7910 stripped = flags & ~stripped;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 91851b555e2e..177b73179590 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -413,12 +413,16 @@ static noinline int device_list_add(const char *path,
413 413
414 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
415 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) { 416 } else if (!device->name || strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS); 417 name = kstrdup(path, GFP_NOFS);
418 if (!name) 418 if (!name)
419 return -ENOMEM; 419 return -ENOMEM;
420 kfree(device->name); 420 kfree(device->name);
421 device->name = name; 421 device->name = name;
422 if (device->missing) {
423 fs_devices->missing_devices--;
424 device->missing = 0;
425 }
422 } 426 }
423 427
424 if (found_transid > fs_devices->latest_trans) { 428 if (found_transid > fs_devices->latest_trans) {
@@ -1238,6 +1242,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1238 1242
1239 device->fs_devices->num_devices--; 1243 device->fs_devices->num_devices--;
1240 1244
1245 if (device->missing)
1246 root->fs_info->fs_devices->missing_devices--;
1247
1241 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1248 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1242 struct btrfs_device, dev_list); 1249 struct btrfs_device, dev_list);
1243 if (device->bdev == root->fs_info->sb->s_bdev) 1250 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -3084,7 +3091,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 device->devid = devid; 3091 device->devid = devid;
3085 device->work.func = pending_bios_fn; 3092 device->work.func = pending_bios_fn;
3086 device->fs_devices = fs_devices; 3093 device->fs_devices = fs_devices;
3094 device->missing = 1;
3087 fs_devices->num_devices++; 3095 fs_devices->num_devices++;
3096 fs_devices->missing_devices++;
3088 spin_lock_init(&device->io_lock); 3097 spin_lock_init(&device->io_lock);
3089 INIT_LIST_HEAD(&device->dev_alloc_list); 3098 INIT_LIST_HEAD(&device->dev_alloc_list);
3090 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3099 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3282,6 +3291,15 @@ static int read_one_dev(struct btrfs_root *root,
3282 device = add_missing_dev(root, devid, dev_uuid); 3291 device = add_missing_dev(root, devid, dev_uuid);
3283 if (!device) 3292 if (!device)
3284 return -ENOMEM; 3293 return -ENOMEM;
3294 } else if (!device->missing) {
3295 /*
3296 * this happens when a device that was properly setup
3297 * in the device info lists suddenly goes bad.
3298 * device->bdev is NULL, and so we have to set
3299 * device->missing to one here
3300 */
3301 root->fs_info->fs_devices->missing_devices++;
3302 device->missing = 1;
3285 } 3303 }
3286 } 3304 }
3287 3305
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..a668c0116982 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -45,6 +45,7 @@ struct btrfs_device {
45 int barriers; 45 int barriers;
46 int writeable; 46 int writeable;
47 int in_fs_metadata; 47 int in_fs_metadata;
48 int missing;
48 49
49 spinlock_t io_lock; 50 spinlock_t io_lock;
50 51
@@ -94,6 +95,7 @@ struct btrfs_fs_devices {
94 u64 num_devices; 95 u64 num_devices;
95 u64 open_devices; 96 u64 open_devices;
96 u64 rw_devices; 97 u64 rw_devices;
98 u64 missing_devices;
97 u64 total_rw_bytes; 99 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 100 struct block_device *latest_bdev;
99 101