diff options
author | Chris Mason <chris.mason@fusionio.com> | 2013-02-20 14:06:05 -0500 |
---|---|---|
committer | Chris Mason <chris.mason@fusionio.com> | 2013-02-20 14:06:05 -0500 |
commit | e942f883bc6651d50be139477baf6fb0eed3d5bb (patch) | |
tree | e1d19783e9c8b42198a69c17c9719fb90f302847 /fs/btrfs/volumes.c | |
parent | b2c6b3e0611c58fbeb6b9c0892b6249f7bdfaf6b (diff) | |
parent | 0e4e02636611dbf89a2f36320a32054f9936d6cb (diff) |
Merge branch 'raid56-experimental' into for-linus-3.9
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Conflicts:
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/volumes.c
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- | fs/btrfs/volumes.c | 380 |
1 files changed, 336 insertions, 44 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72b1cf1b2b5e..7992dc4ea4cc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <linux/raid/pq.h> | ||
29 | #include <asm/div64.h> | ||
28 | #include "compat.h" | 30 | #include "compat.h" |
29 | #include "ctree.h" | 31 | #include "ctree.h" |
30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
@@ -32,6 +34,7 @@ | |||
32 | #include "transaction.h" | 34 | #include "transaction.h" |
33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
34 | #include "volumes.h" | 36 | #include "volumes.h" |
37 | #include "raid56.h" | ||
35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
@@ -1465,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1465 | goto out; | 1468 | goto out; |
1466 | } | 1469 | } |
1467 | 1470 | ||
1471 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
1472 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
1473 | printk(KERN_ERR "btrfs: unable to go below two " | ||
1474 | "devices on raid5\n"); | ||
1475 | ret = -EINVAL; | ||
1476 | goto out; | ||
1477 | } | ||
1478 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
1479 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
1480 | printk(KERN_ERR "btrfs: unable to go below three " | ||
1481 | "devices on raid6\n"); | ||
1482 | ret = -EINVAL; | ||
1483 | goto out; | ||
1484 | } | ||
1485 | |||
1468 | if (strcmp(device_path, "missing") == 0) { | 1486 | if (strcmp(device_path, "missing") == 0) { |
1469 | struct list_head *devices; | 1487 | struct list_head *devices; |
1470 | struct btrfs_device *tmp; | 1488 | struct btrfs_device *tmp; |
@@ -2726,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
2726 | return 0; | 2744 | return 0; |
2727 | 2745 | ||
2728 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2746 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
2729 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2747 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
2730 | factor = 2; | 2748 | factor = num_stripes / 2; |
2731 | else | 2749 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
2732 | factor = 1; | 2750 | factor = num_stripes - 1; |
2733 | factor = num_stripes / factor; | 2751 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
2752 | factor = num_stripes - 2; | ||
2753 | } else { | ||
2754 | factor = num_stripes; | ||
2755 | } | ||
2734 | 2756 | ||
2735 | for (i = 0; i < num_stripes; i++) { | 2757 | for (i = 0; i < num_stripes; i++) { |
2736 | stripe = btrfs_stripe_nr(chunk, i); | 2758 | stripe = btrfs_stripe_nr(chunk, i); |
@@ -3090,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3090 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3112 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
3091 | else | 3113 | else |
3092 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3114 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
3093 | BTRFS_BLOCK_GROUP_RAID10); | 3115 | BTRFS_BLOCK_GROUP_RAID10 | |
3116 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3117 | BTRFS_BLOCK_GROUP_RAID6); | ||
3094 | 3118 | ||
3095 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3119 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3096 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3120 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
@@ -3130,7 +3154,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3130 | 3154 | ||
3131 | /* allow to reduce meta or sys integrity only if force set */ | 3155 | /* allow to reduce meta or sys integrity only if force set */ |
3132 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3156 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
3133 | BTRFS_BLOCK_GROUP_RAID10; | 3157 | BTRFS_BLOCK_GROUP_RAID10 | |
3158 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3159 | BTRFS_BLOCK_GROUP_RAID6; | ||
3134 | do { | 3160 | do { |
3135 | seq = read_seqbegin(&fs_info->profiles_lock); | 3161 | seq = read_seqbegin(&fs_info->profiles_lock); |
3136 | 3162 | ||
@@ -3204,11 +3230,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3204 | update_ioctl_balance_args(fs_info, 0, bargs); | 3230 | update_ioctl_balance_args(fs_info, 0, bargs); |
3205 | } | 3231 | } |
3206 | 3232 | ||
3207 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
3208 | balance_need_close(fs_info)) { | ||
3209 | __cancel_balance(fs_info); | ||
3210 | } | ||
3211 | |||
3212 | wake_up(&fs_info->balance_wait_q); | 3233 | wake_up(&fs_info->balance_wait_q); |
3213 | 3234 | ||
3214 | return ret; | 3235 | return ret; |
@@ -3611,8 +3632,46 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | |||
3611 | .devs_increment = 1, | 3632 | .devs_increment = 1, |
3612 | .ncopies = 1, | 3633 | .ncopies = 1, |
3613 | }, | 3634 | }, |
3635 | [BTRFS_RAID_RAID5] = { | ||
3636 | .sub_stripes = 1, | ||
3637 | .dev_stripes = 1, | ||
3638 | .devs_max = 0, | ||
3639 | .devs_min = 2, | ||
3640 | .devs_increment = 1, | ||
3641 | .ncopies = 2, | ||
3642 | }, | ||
3643 | [BTRFS_RAID_RAID6] = { | ||
3644 | .sub_stripes = 1, | ||
3645 | .dev_stripes = 1, | ||
3646 | .devs_max = 0, | ||
3647 | .devs_min = 3, | ||
3648 | .devs_increment = 1, | ||
3649 | .ncopies = 3, | ||
3650 | }, | ||
3614 | }; | 3651 | }; |
3615 | 3652 | ||
3653 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
3654 | { | ||
3655 | /* TODO allow them to set a preferred stripe size */ | ||
3656 | return 64 * 1024; | ||
3657 | } | ||
3658 | |||
3659 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
3660 | { | ||
3661 | u64 features; | ||
3662 | |||
3663 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
3664 | return; | ||
3665 | |||
3666 | features = btrfs_super_incompat_flags(info->super_copy); | ||
3667 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
3668 | return; | ||
3669 | |||
3670 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
3671 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
3672 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
3673 | } | ||
3674 | |||
3616 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3675 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
3617 | struct btrfs_root *extent_root, | 3676 | struct btrfs_root *extent_root, |
3618 | struct map_lookup **map_ret, | 3677 | struct map_lookup **map_ret, |
@@ -3628,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3628 | struct btrfs_device_info *devices_info = NULL; | 3687 | struct btrfs_device_info *devices_info = NULL; |
3629 | u64 total_avail; | 3688 | u64 total_avail; |
3630 | int num_stripes; /* total number of stripes to allocate */ | 3689 | int num_stripes; /* total number of stripes to allocate */ |
3690 | int data_stripes; /* number of stripes that count for | ||
3691 | block group size */ | ||
3631 | int sub_stripes; /* sub_stripes info for map */ | 3692 | int sub_stripes; /* sub_stripes info for map */ |
3632 | int dev_stripes; /* stripes per dev */ | 3693 | int dev_stripes; /* stripes per dev */ |
3633 | int devs_max; /* max devs to use */ | 3694 | int devs_max; /* max devs to use */ |
@@ -3639,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3639 | u64 max_chunk_size; | 3700 | u64 max_chunk_size; |
3640 | u64 stripe_size; | 3701 | u64 stripe_size; |
3641 | u64 num_bytes; | 3702 | u64 num_bytes; |
3703 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
3642 | int ndevs; | 3704 | int ndevs; |
3643 | int i; | 3705 | int i; |
3644 | int j; | 3706 | int j; |
@@ -3768,16 +3830,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3768 | stripe_size = devices_info[ndevs-1].max_avail; | 3830 | stripe_size = devices_info[ndevs-1].max_avail; |
3769 | num_stripes = ndevs * dev_stripes; | 3831 | num_stripes = ndevs * dev_stripes; |
3770 | 3832 | ||
3833 | /* | ||
3834 | * this will have to be fixed for RAID1 and RAID10 over | ||
3835 | * more drives | ||
3836 | */ | ||
3837 | data_stripes = num_stripes / ncopies; | ||
3838 | |||
3771 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3839 | if (stripe_size * ndevs > max_chunk_size * ncopies) { |
3772 | stripe_size = max_chunk_size * ncopies; | 3840 | stripe_size = max_chunk_size * ncopies; |
3773 | do_div(stripe_size, ndevs); | 3841 | do_div(stripe_size, ndevs); |
3774 | } | 3842 | } |
3775 | 3843 | if (type & BTRFS_BLOCK_GROUP_RAID5) { | |
3844 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
3845 | btrfs_super_stripesize(info->super_copy)); | ||
3846 | data_stripes = num_stripes - 1; | ||
3847 | } | ||
3848 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
3849 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
3850 | btrfs_super_stripesize(info->super_copy)); | ||
3851 | data_stripes = num_stripes - 2; | ||
3852 | } | ||
3776 | do_div(stripe_size, dev_stripes); | 3853 | do_div(stripe_size, dev_stripes); |
3777 | 3854 | ||
3778 | /* align to BTRFS_STRIPE_LEN */ | 3855 | /* align to BTRFS_STRIPE_LEN */ |
3779 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3856 | do_div(stripe_size, raid_stripe_len); |
3780 | stripe_size *= BTRFS_STRIPE_LEN; | 3857 | stripe_size *= raid_stripe_len; |
3781 | 3858 | ||
3782 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3859 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
3783 | if (!map) { | 3860 | if (!map) { |
@@ -3795,14 +3872,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3795 | } | 3872 | } |
3796 | } | 3873 | } |
3797 | map->sector_size = extent_root->sectorsize; | 3874 | map->sector_size = extent_root->sectorsize; |
3798 | map->stripe_len = BTRFS_STRIPE_LEN; | 3875 | map->stripe_len = raid_stripe_len; |
3799 | map->io_align = BTRFS_STRIPE_LEN; | 3876 | map->io_align = raid_stripe_len; |
3800 | map->io_width = BTRFS_STRIPE_LEN; | 3877 | map->io_width = raid_stripe_len; |
3801 | map->type = type; | 3878 | map->type = type; |
3802 | map->sub_stripes = sub_stripes; | 3879 | map->sub_stripes = sub_stripes; |
3803 | 3880 | ||
3804 | *map_ret = map; | 3881 | *map_ret = map; |
3805 | num_bytes = stripe_size * (num_stripes / ncopies); | 3882 | num_bytes = stripe_size * data_stripes; |
3806 | 3883 | ||
3807 | *stripe_size_out = stripe_size; | 3884 | *stripe_size_out = stripe_size; |
3808 | *num_bytes_out = num_bytes; | 3885 | *num_bytes_out = num_bytes; |
@@ -3853,6 +3930,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3853 | } | 3930 | } |
3854 | 3931 | ||
3855 | free_extent_map(em); | 3932 | free_extent_map(em); |
3933 | check_raid56_incompat_flag(extent_root->fs_info, type); | ||
3934 | |||
3856 | kfree(devices_info); | 3935 | kfree(devices_info); |
3857 | return 0; | 3936 | return 0; |
3858 | 3937 | ||
@@ -4136,6 +4215,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4136 | ret = map->num_stripes; | 4215 | ret = map->num_stripes; |
4137 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4216 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4138 | ret = map->sub_stripes; | 4217 | ret = map->sub_stripes; |
4218 | else if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
4219 | ret = 2; | ||
4220 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4221 | ret = 3; | ||
4139 | else | 4222 | else |
4140 | ret = 1; | 4223 | ret = 1; |
4141 | free_extent_map(em); | 4224 | free_extent_map(em); |
@@ -4148,6 +4231,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4148 | return ret; | 4231 | return ret; |
4149 | } | 4232 | } |
4150 | 4233 | ||
4234 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
4235 | struct btrfs_mapping_tree *map_tree, | ||
4236 | u64 logical) | ||
4237 | { | ||
4238 | struct extent_map *em; | ||
4239 | struct map_lookup *map; | ||
4240 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4241 | unsigned long len = root->sectorsize; | ||
4242 | |||
4243 | read_lock(&em_tree->lock); | ||
4244 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4245 | read_unlock(&em_tree->lock); | ||
4246 | BUG_ON(!em); | ||
4247 | |||
4248 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4249 | map = (struct map_lookup *)em->bdev; | ||
4250 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4251 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4252 | len = map->stripe_len * nr_data_stripes(map); | ||
4253 | } | ||
4254 | free_extent_map(em); | ||
4255 | return len; | ||
4256 | } | ||
4257 | |||
4258 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
4259 | u64 logical, u64 len, int mirror_num) | ||
4260 | { | ||
4261 | struct extent_map *em; | ||
4262 | struct map_lookup *map; | ||
4263 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4264 | int ret = 0; | ||
4265 | |||
4266 | read_lock(&em_tree->lock); | ||
4267 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4268 | read_unlock(&em_tree->lock); | ||
4269 | BUG_ON(!em); | ||
4270 | |||
4271 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4272 | map = (struct map_lookup *)em->bdev; | ||
4273 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4274 | BTRFS_BLOCK_GROUP_RAID6)) | ||
4275 | ret = 1; | ||
4276 | free_extent_map(em); | ||
4277 | return ret; | ||
4278 | } | ||
4279 | |||
4151 | static int find_live_mirror(struct btrfs_fs_info *fs_info, | 4280 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
4152 | struct map_lookup *map, int first, int num, | 4281 | struct map_lookup *map, int first, int num, |
4153 | int optimal, int dev_replace_is_ongoing) | 4282 | int optimal, int dev_replace_is_ongoing) |
@@ -4185,10 +4314,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
4185 | return optimal; | 4314 | return optimal; |
4186 | } | 4315 | } |
4187 | 4316 | ||
4317 | static inline int parity_smaller(u64 a, u64 b) | ||
4318 | { | ||
4319 | return a > b; | ||
4320 | } | ||
4321 | |||
4322 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ | ||
4323 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | ||
4324 | { | ||
4325 | struct btrfs_bio_stripe s; | ||
4326 | int i; | ||
4327 | u64 l; | ||
4328 | int again = 1; | ||
4329 | |||
4330 | while (again) { | ||
4331 | again = 0; | ||
4332 | for (i = 0; i < bbio->num_stripes - 1; i++) { | ||
4333 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | ||
4334 | s = bbio->stripes[i]; | ||
4335 | l = raid_map[i]; | ||
4336 | bbio->stripes[i] = bbio->stripes[i+1]; | ||
4337 | raid_map[i] = raid_map[i+1]; | ||
4338 | bbio->stripes[i+1] = s; | ||
4339 | raid_map[i+1] = l; | ||
4340 | again = 1; | ||
4341 | } | ||
4342 | } | ||
4343 | } | ||
4344 | } | ||
4345 | |||
4188 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 4346 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
4189 | u64 logical, u64 *length, | 4347 | u64 logical, u64 *length, |
4190 | struct btrfs_bio **bbio_ret, | 4348 | struct btrfs_bio **bbio_ret, |
4191 | int mirror_num) | 4349 | int mirror_num, u64 **raid_map_ret) |
4192 | { | 4350 | { |
4193 | struct extent_map *em; | 4351 | struct extent_map *em; |
4194 | struct map_lookup *map; | 4352 | struct map_lookup *map; |
@@ -4200,6 +4358,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4200 | u64 stripe_nr; | 4358 | u64 stripe_nr; |
4201 | u64 stripe_nr_orig; | 4359 | u64 stripe_nr_orig; |
4202 | u64 stripe_nr_end; | 4360 | u64 stripe_nr_end; |
4361 | u64 stripe_len; | ||
4362 | u64 *raid_map = NULL; | ||
4203 | int stripe_index; | 4363 | int stripe_index; |
4204 | int i; | 4364 | int i; |
4205 | int ret = 0; | 4365 | int ret = 0; |
@@ -4211,6 +4371,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4211 | int num_alloc_stripes; | 4371 | int num_alloc_stripes; |
4212 | int patch_the_first_stripe_for_dev_replace = 0; | 4372 | int patch_the_first_stripe_for_dev_replace = 0; |
4213 | u64 physical_to_patch_in_first_stripe = 0; | 4373 | u64 physical_to_patch_in_first_stripe = 0; |
4374 | u64 raid56_full_stripe_start = (u64)-1; | ||
4214 | 4375 | ||
4215 | read_lock(&em_tree->lock); | 4376 | read_lock(&em_tree->lock); |
4216 | em = lookup_extent_mapping(em_tree, logical, *length); | 4377 | em = lookup_extent_mapping(em_tree, logical, *length); |
@@ -4227,29 +4388,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4227 | map = (struct map_lookup *)em->bdev; | 4388 | map = (struct map_lookup *)em->bdev; |
4228 | offset = logical - em->start; | 4389 | offset = logical - em->start; |
4229 | 4390 | ||
4391 | if (mirror_num > map->num_stripes) | ||
4392 | mirror_num = 0; | ||
4393 | |||
4394 | stripe_len = map->stripe_len; | ||
4230 | stripe_nr = offset; | 4395 | stripe_nr = offset; |
4231 | /* | 4396 | /* |
4232 | * stripe_nr counts the total number of stripes we have to stride | 4397 | * stripe_nr counts the total number of stripes we have to stride |
4233 | * to get to this block | 4398 | * to get to this block |
4234 | */ | 4399 | */ |
4235 | do_div(stripe_nr, map->stripe_len); | 4400 | do_div(stripe_nr, stripe_len); |
4236 | 4401 | ||
4237 | stripe_offset = stripe_nr * map->stripe_len; | 4402 | stripe_offset = stripe_nr * stripe_len; |
4238 | BUG_ON(offset < stripe_offset); | 4403 | BUG_ON(offset < stripe_offset); |
4239 | 4404 | ||
4240 | /* stripe_offset is the offset of this block in its stripe*/ | 4405 | /* stripe_offset is the offset of this block in its stripe*/ |
4241 | stripe_offset = offset - stripe_offset; | 4406 | stripe_offset = offset - stripe_offset; |
4242 | 4407 | ||
4243 | if (rw & REQ_DISCARD) | 4408 | /* if we're here for raid56, we need to know the stripe aligned start */ |
4409 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4410 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
4411 | raid56_full_stripe_start = offset; | ||
4412 | |||
4413 | /* allow a write of a full stripe, but make sure we don't | ||
4414 | * allow straddling of stripes | ||
4415 | */ | ||
4416 | do_div(raid56_full_stripe_start, full_stripe_len); | ||
4417 | raid56_full_stripe_start *= full_stripe_len; | ||
4418 | } | ||
4419 | |||
4420 | if (rw & REQ_DISCARD) { | ||
4421 | /* we don't discard raid56 yet */ | ||
4422 | if (map->type & | ||
4423 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4424 | ret = -EOPNOTSUPP; | ||
4425 | goto out; | ||
4426 | } | ||
4244 | *length = min_t(u64, em->len - offset, *length); | 4427 | *length = min_t(u64, em->len - offset, *length); |
4245 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | 4428 | } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
4246 | /* we limit the length of each bio to what fits in a stripe */ | 4429 | u64 max_len; |
4247 | *length = min_t(u64, em->len - offset, | 4430 | /* For writes to RAID[56], allow a full stripeset across all disks. |
4248 | map->stripe_len - stripe_offset); | 4431 | For other RAID types and for RAID[56] reads, just allow a single |
4432 | stripe (on a single disk). */ | ||
4433 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && | ||
4434 | (rw & REQ_WRITE)) { | ||
4435 | max_len = stripe_len * nr_data_stripes(map) - | ||
4436 | (offset - raid56_full_stripe_start); | ||
4437 | } else { | ||
4438 | /* we limit the length of each bio to what fits in a stripe */ | ||
4439 | max_len = stripe_len - stripe_offset; | ||
4440 | } | ||
4441 | *length = min_t(u64, em->len - offset, max_len); | ||
4249 | } else { | 4442 | } else { |
4250 | *length = em->len - offset; | 4443 | *length = em->len - offset; |
4251 | } | 4444 | } |
4252 | 4445 | ||
4446 | /* This is for when we're called from btrfs_merge_bio_hook() and all | ||
4447 | it cares about is the length */ | ||
4253 | if (!bbio_ret) | 4448 | if (!bbio_ret) |
4254 | goto out; | 4449 | goto out; |
4255 | 4450 | ||
@@ -4282,7 +4477,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4282 | u64 physical_of_found = 0; | 4477 | u64 physical_of_found = 0; |
4283 | 4478 | ||
4284 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | 4479 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, |
4285 | logical, &tmp_length, &tmp_bbio, 0); | 4480 | logical, &tmp_length, &tmp_bbio, 0, NULL); |
4286 | if (ret) { | 4481 | if (ret) { |
4287 | WARN_ON(tmp_bbio != NULL); | 4482 | WARN_ON(tmp_bbio != NULL); |
4288 | goto out; | 4483 | goto out; |
@@ -4348,6 +4543,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4348 | do_div(stripe_nr_end, map->stripe_len); | 4543 | do_div(stripe_nr_end, map->stripe_len); |
4349 | stripe_end_offset = stripe_nr_end * map->stripe_len - | 4544 | stripe_end_offset = stripe_nr_end * map->stripe_len - |
4350 | (offset + *length); | 4545 | (offset + *length); |
4546 | |||
4351 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4547 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4352 | if (rw & REQ_DISCARD) | 4548 | if (rw & REQ_DISCARD) |
4353 | num_stripes = min_t(u64, map->num_stripes, | 4549 | num_stripes = min_t(u64, map->num_stripes, |
@@ -4398,6 +4594,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4398 | dev_replace_is_ongoing); | 4594 | dev_replace_is_ongoing); |
4399 | mirror_num = stripe_index - old_stripe_index + 1; | 4595 | mirror_num = stripe_index - old_stripe_index + 1; |
4400 | } | 4596 | } |
4597 | |||
4598 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4599 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4600 | u64 tmp; | ||
4601 | |||
4602 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | ||
4603 | && raid_map_ret) { | ||
4604 | int i, rot; | ||
4605 | |||
4606 | /* push stripe_nr back to the start of the full stripe */ | ||
4607 | stripe_nr = raid56_full_stripe_start; | ||
4608 | do_div(stripe_nr, stripe_len); | ||
4609 | |||
4610 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4611 | |||
4612 | /* RAID[56] write or recovery. Return all stripes */ | ||
4613 | num_stripes = map->num_stripes; | ||
4614 | max_errors = nr_parity_stripes(map); | ||
4615 | |||
4616 | raid_map = kmalloc(sizeof(u64) * num_stripes, | ||
4617 | GFP_NOFS); | ||
4618 | if (!raid_map) { | ||
4619 | ret = -ENOMEM; | ||
4620 | goto out; | ||
4621 | } | ||
4622 | |||
4623 | /* Work out the disk rotation on this stripe-set */ | ||
4624 | tmp = stripe_nr; | ||
4625 | rot = do_div(tmp, num_stripes); | ||
4626 | |||
4627 | /* Fill in the logical address of each stripe */ | ||
4628 | tmp = stripe_nr * nr_data_stripes(map); | ||
4629 | for (i = 0; i < nr_data_stripes(map); i++) | ||
4630 | raid_map[(i+rot) % num_stripes] = | ||
4631 | em->start + (tmp + i) * map->stripe_len; | ||
4632 | |||
4633 | raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; | ||
4634 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4635 | raid_map[(i+rot+1) % num_stripes] = | ||
4636 | RAID6_Q_STRIPE; | ||
4637 | |||
4638 | *length = map->stripe_len; | ||
4639 | stripe_index = 0; | ||
4640 | stripe_offset = 0; | ||
4641 | } else { | ||
4642 | /* | ||
4643 | * Mirror #0 or #1 means the original data block. | ||
4644 | * Mirror #2 is RAID5 parity block. | ||
4645 | * Mirror #3 is RAID6 Q block. | ||
4646 | */ | ||
4647 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4648 | if (mirror_num > 1) | ||
4649 | stripe_index = nr_data_stripes(map) + | ||
4650 | mirror_num - 2; | ||
4651 | |||
4652 | /* We distribute the parity blocks across stripes */ | ||
4653 | tmp = stripe_nr + stripe_index; | ||
4654 | stripe_index = do_div(tmp, map->num_stripes); | ||
4655 | } | ||
4401 | } else { | 4656 | } else { |
4402 | /* | 4657 | /* |
4403 | * after this do_div call, stripe_nr is the number of stripes | 4658 | * after this do_div call, stripe_nr is the number of stripes |
@@ -4506,8 +4761,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4506 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { | 4761 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
4507 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4762 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
4508 | BTRFS_BLOCK_GROUP_RAID10 | | 4763 | BTRFS_BLOCK_GROUP_RAID10 | |
4764 | BTRFS_BLOCK_GROUP_RAID5 | | ||
4509 | BTRFS_BLOCK_GROUP_DUP)) { | 4765 | BTRFS_BLOCK_GROUP_DUP)) { |
4510 | max_errors = 1; | 4766 | max_errors = 1; |
4767 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
4768 | max_errors = 2; | ||
4511 | } | 4769 | } |
4512 | } | 4770 | } |
4513 | 4771 | ||
@@ -4608,6 +4866,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4608 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | 4866 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; |
4609 | bbio->mirror_num = map->num_stripes + 1; | 4867 | bbio->mirror_num = map->num_stripes + 1; |
4610 | } | 4868 | } |
4869 | if (raid_map) { | ||
4870 | sort_parity_stripes(bbio, raid_map); | ||
4871 | *raid_map_ret = raid_map; | ||
4872 | } | ||
4611 | out: | 4873 | out: |
4612 | if (dev_replace_is_ongoing) | 4874 | if (dev_replace_is_ongoing) |
4613 | btrfs_dev_replace_unlock(dev_replace); | 4875 | btrfs_dev_replace_unlock(dev_replace); |
@@ -4620,7 +4882,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4620 | struct btrfs_bio **bbio_ret, int mirror_num) | 4882 | struct btrfs_bio **bbio_ret, int mirror_num) |
4621 | { | 4883 | { |
4622 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | 4884 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
4623 | mirror_num); | 4885 | mirror_num, NULL); |
4624 | } | 4886 | } |
4625 | 4887 | ||
4626 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 4888 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
@@ -4634,6 +4896,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4634 | u64 bytenr; | 4896 | u64 bytenr; |
4635 | u64 length; | 4897 | u64 length; |
4636 | u64 stripe_nr; | 4898 | u64 stripe_nr; |
4899 | u64 rmap_len; | ||
4637 | int i, j, nr = 0; | 4900 | int i, j, nr = 0; |
4638 | 4901 | ||
4639 | read_lock(&em_tree->lock); | 4902 | read_lock(&em_tree->lock); |
@@ -4644,10 +4907,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4644 | map = (struct map_lookup *)em->bdev; | 4907 | map = (struct map_lookup *)em->bdev; |
4645 | 4908 | ||
4646 | length = em->len; | 4909 | length = em->len; |
4910 | rmap_len = map->stripe_len; | ||
4911 | |||
4647 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4912 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4648 | do_div(length, map->num_stripes / map->sub_stripes); | 4913 | do_div(length, map->num_stripes / map->sub_stripes); |
4649 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | 4914 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
4650 | do_div(length, map->num_stripes); | 4915 | do_div(length, map->num_stripes); |
4916 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4917 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4918 | do_div(length, nr_data_stripes(map)); | ||
4919 | rmap_len = map->stripe_len * nr_data_stripes(map); | ||
4920 | } | ||
4651 | 4921 | ||
4652 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | 4922 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); |
4653 | BUG_ON(!buf); /* -ENOMEM */ | 4923 | BUG_ON(!buf); /* -ENOMEM */ |
@@ -4667,8 +4937,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4667 | do_div(stripe_nr, map->sub_stripes); | 4937 | do_div(stripe_nr, map->sub_stripes); |
4668 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4938 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4669 | stripe_nr = stripe_nr * map->num_stripes + i; | 4939 | stripe_nr = stripe_nr * map->num_stripes + i; |
4670 | } | 4940 | } /* else if RAID[56], multiply by nr_data_stripes(). |
4671 | bytenr = chunk_start + stripe_nr * map->stripe_len; | 4941 | * Alternatively, just use rmap_len below instead of |
4942 | * map->stripe_len */ | ||
4943 | |||
4944 | bytenr = chunk_start + stripe_nr * rmap_len; | ||
4672 | WARN_ON(nr >= map->num_stripes); | 4945 | WARN_ON(nr >= map->num_stripes); |
4673 | for (j = 0; j < nr; j++) { | 4946 | for (j = 0; j < nr; j++) { |
4674 | if (buf[j] == bytenr) | 4947 | if (buf[j] == bytenr) |
@@ -4682,7 +4955,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4682 | 4955 | ||
4683 | *logical = buf; | 4956 | *logical = buf; |
4684 | *naddrs = nr; | 4957 | *naddrs = nr; |
4685 | *stripe_len = map->stripe_len; | 4958 | *stripe_len = rmap_len; |
4686 | 4959 | ||
4687 | free_extent_map(em); | 4960 | free_extent_map(em); |
4688 | return 0; | 4961 | return 0; |
@@ -4756,7 +5029,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
4756 | bio->bi_bdev = (struct block_device *) | 5029 | bio->bi_bdev = (struct block_device *) |
4757 | (unsigned long)bbio->mirror_num; | 5030 | (unsigned long)bbio->mirror_num; |
4758 | /* only send an error to the higher layers if it is | 5031 | /* only send an error to the higher layers if it is |
4759 | * beyond the tolerance of the multi-bio | 5032 | * beyond the tolerance of the btrfs bio |
4760 | */ | 5033 | */ |
4761 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 5034 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
4762 | err = -EIO; | 5035 | err = -EIO; |
@@ -4790,13 +5063,18 @@ struct async_sched { | |||
4790 | * This will add one bio to the pending list for a device and make sure | 5063 | * This will add one bio to the pending list for a device and make sure |
4791 | * the work struct is scheduled. | 5064 | * the work struct is scheduled. |
4792 | */ | 5065 | */ |
4793 | static noinline void schedule_bio(struct btrfs_root *root, | 5066 | noinline void btrfs_schedule_bio(struct btrfs_root *root, |
4794 | struct btrfs_device *device, | 5067 | struct btrfs_device *device, |
4795 | int rw, struct bio *bio) | 5068 | int rw, struct bio *bio) |
4796 | { | 5069 | { |
4797 | int should_queue = 1; | 5070 | int should_queue = 1; |
4798 | struct btrfs_pending_bios *pending_bios; | 5071 | struct btrfs_pending_bios *pending_bios; |
4799 | 5072 | ||
5073 | if (device->missing || !device->bdev) { | ||
5074 | bio_endio(bio, -EIO); | ||
5075 | return; | ||
5076 | } | ||
5077 | |||
4800 | /* don't bother with additional async steps for reads, right now */ | 5078 | /* don't bother with additional async steps for reads, right now */ |
4801 | if (!(rw & REQ_WRITE)) { | 5079 | if (!(rw & REQ_WRITE)) { |
4802 | bio_get(bio); | 5080 | bio_get(bio); |
@@ -4894,7 +5172,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | |||
4894 | #endif | 5172 | #endif |
4895 | bio->bi_bdev = dev->bdev; | 5173 | bio->bi_bdev = dev->bdev; |
4896 | if (async) | 5174 | if (async) |
4897 | schedule_bio(root, dev, rw, bio); | 5175 | btrfs_schedule_bio(root, dev, rw, bio); |
4898 | else | 5176 | else |
4899 | btrfsic_submit_bio(rw, bio); | 5177 | btrfsic_submit_bio(rw, bio); |
4900 | } | 5178 | } |
@@ -4953,6 +5231,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4953 | u64 logical = (u64)bio->bi_sector << 9; | 5231 | u64 logical = (u64)bio->bi_sector << 9; |
4954 | u64 length = 0; | 5232 | u64 length = 0; |
4955 | u64 map_length; | 5233 | u64 map_length; |
5234 | u64 *raid_map = NULL; | ||
4956 | int ret; | 5235 | int ret; |
4957 | int dev_nr = 0; | 5236 | int dev_nr = 0; |
4958 | int total_devs = 1; | 5237 | int total_devs = 1; |
@@ -4961,12 +5240,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4961 | length = bio->bi_size; | 5240 | length = bio->bi_size; |
4962 | map_length = length; | 5241 | map_length = length; |
4963 | 5242 | ||
4964 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, | 5243 | ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
4965 | mirror_num); | 5244 | mirror_num, &raid_map); |
4966 | if (ret) | 5245 | if (ret) /* -ENOMEM */ |
4967 | return ret; | 5246 | return ret; |
4968 | 5247 | ||
4969 | total_devs = bbio->num_stripes; | 5248 | total_devs = bbio->num_stripes; |
5249 | bbio->orig_bio = first_bio; | ||
5250 | bbio->private = first_bio->bi_private; | ||
5251 | bbio->end_io = first_bio->bi_end_io; | ||
5252 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
5253 | |||
5254 | if (raid_map) { | ||
5255 | /* In this case, map_length has been set to the length of | ||
5256 | a single stripe; not the whole write */ | ||
5257 | if (rw & WRITE) { | ||
5258 | return raid56_parity_write(root, bio, bbio, | ||
5259 | raid_map, map_length); | ||
5260 | } else { | ||
5261 | return raid56_parity_recover(root, bio, bbio, | ||
5262 | raid_map, map_length, | ||
5263 | mirror_num); | ||
5264 | } | ||
5265 | } | ||
5266 | |||
4970 | if (map_length < length) { | 5267 | if (map_length < length) { |
4971 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " | 5268 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " |
4972 | "len %llu\n", (unsigned long long)logical, | 5269 | "len %llu\n", (unsigned long long)logical, |
@@ -4975,11 +5272,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4975 | BUG(); | 5272 | BUG(); |
4976 | } | 5273 | } |
4977 | 5274 | ||
4978 | bbio->orig_bio = first_bio; | ||
4979 | bbio->private = first_bio->bi_private; | ||
4980 | bbio->end_io = first_bio->bi_end_io; | ||
4981 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
4982 | |||
4983 | while (dev_nr < total_devs) { | 5275 | while (dev_nr < total_devs) { |
4984 | dev = bbio->stripes[dev_nr].dev; | 5276 | dev = bbio->stripes[dev_nr].dev; |
4985 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | 5277 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { |