aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/volumes.c
diff options
context:
space:
mode:
author Chris Mason <chris.mason@fusionio.com> 2013-02-20 14:06:05 -0500
committer Chris Mason <chris.mason@fusionio.com> 2013-02-20 14:06:05 -0500
commit e942f883bc6651d50be139477baf6fb0eed3d5bb (patch)
tree e1d19783e9c8b42198a69c17c9719fb90f302847 /fs/btrfs/volumes.c
parent b2c6b3e0611c58fbeb6b9c0892b6249f7bdfaf6b (diff)
parent 0e4e02636611dbf89a2f36320a32054f9936d6cb (diff)
Merge branch 'raid56-experimental' into for-linus-3.9
Signed-off-by: Chris Mason <chris.mason@fusionio.com>

Conflicts:
	fs/btrfs/ctree.h
	fs/btrfs/extent-tree.c
	fs/btrfs/inode.c
	fs/btrfs/volumes.c
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r-- fs/btrfs/volumes.c 380
1 files changed, 336 insertions, 44 deletions
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 72b1cf1b2b5e..7992dc4ea4cc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -1465,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1465 goto out; 1468 goto out;
1466 } 1469 }
1467 1470
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1472 root->fs_info->fs_devices->rw_devices <= 2) {
1473 printk(KERN_ERR "btrfs: unable to go below two "
1474 "devices on raid5\n");
1475 ret = -EINVAL;
1476 goto out;
1477 }
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1479 root->fs_info->fs_devices->rw_devices <= 3) {
1480 printk(KERN_ERR "btrfs: unable to go below three "
1481 "devices on raid6\n");
1482 ret = -EINVAL;
1483 goto out;
1484 }
1485
1468 if (strcmp(device_path, "missing") == 0) { 1486 if (strcmp(device_path, "missing") == 0) {
1469 struct list_head *devices; 1487 struct list_head *devices;
1470 struct btrfs_device *tmp; 1488 struct btrfs_device *tmp;
@@ -2726,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2726 return 0; 2744 return 0;
2727 2745
2728 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2746 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2729 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2747 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2730 factor = 2; 2748 factor = num_stripes / 2;
2731 else 2749 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2732 factor = 1; 2750 factor = num_stripes - 1;
2733 factor = num_stripes / factor; 2751 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2752 factor = num_stripes - 2;
2753 } else {
2754 factor = num_stripes;
2755 }
2734 2756
2735 for (i = 0; i < num_stripes; i++) { 2757 for (i = 0; i < num_stripes; i++) {
2736 stripe = btrfs_stripe_nr(chunk, i); 2758 stripe = btrfs_stripe_nr(chunk, i);
@@ -3090,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3090 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3112 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3091 else 3113 else
3092 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3114 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3093 BTRFS_BLOCK_GROUP_RAID10); 3115 BTRFS_BLOCK_GROUP_RAID10 |
3116 BTRFS_BLOCK_GROUP_RAID5 |
3117 BTRFS_BLOCK_GROUP_RAID6);
3094 3118
3095 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3119 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3096 (!alloc_profile_is_valid(bctl->data.target, 1) || 3120 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3130,7 +3154,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3130 3154
3131 /* allow to reduce meta or sys integrity only if force set */ 3155 /* allow to reduce meta or sys integrity only if force set */
3132 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3156 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3133 BTRFS_BLOCK_GROUP_RAID10; 3157 BTRFS_BLOCK_GROUP_RAID10 |
3158 BTRFS_BLOCK_GROUP_RAID5 |
3159 BTRFS_BLOCK_GROUP_RAID6;
3134 do { 3160 do {
3135 seq = read_seqbegin(&fs_info->profiles_lock); 3161 seq = read_seqbegin(&fs_info->profiles_lock);
3136 3162
@@ -3204,11 +3230,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3204 update_ioctl_balance_args(fs_info, 0, bargs); 3230 update_ioctl_balance_args(fs_info, 0, bargs);
3205 } 3231 }
3206 3232
3207 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3208 balance_need_close(fs_info)) {
3209 __cancel_balance(fs_info);
3210 }
3211
3212 wake_up(&fs_info->balance_wait_q); 3233 wake_up(&fs_info->balance_wait_q);
3213 3234
3214 return ret; 3235 return ret;
@@ -3611,8 +3632,46 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3611 .devs_increment = 1, 3632 .devs_increment = 1,
3612 .ncopies = 1, 3633 .ncopies = 1,
3613 }, 3634 },
3635 [BTRFS_RAID_RAID5] = {
3636 .sub_stripes = 1,
3637 .dev_stripes = 1,
3638 .devs_max = 0,
3639 .devs_min = 2,
3640 .devs_increment = 1,
3641 .ncopies = 2,
3642 },
3643 [BTRFS_RAID_RAID6] = {
3644 .sub_stripes = 1,
3645 .dev_stripes = 1,
3646 .devs_max = 0,
3647 .devs_min = 3,
3648 .devs_increment = 1,
3649 .ncopies = 3,
3650 },
3614}; 3651};
3615 3652
3653static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3654{
3655 /* TODO allow them to set a preferred stripe size */
3656 return 64 * 1024;
3657}
3658
3659static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3660{
3661 u64 features;
3662
3663 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3664 return;
3665
3666 features = btrfs_super_incompat_flags(info->super_copy);
3667 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3668 return;
3669
3670 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3671 btrfs_set_super_incompat_flags(info->super_copy, features);
3672 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3673}
3674
3616static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3675static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3617 struct btrfs_root *extent_root, 3676 struct btrfs_root *extent_root,
3618 struct map_lookup **map_ret, 3677 struct map_lookup **map_ret,
@@ -3628,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3628 struct btrfs_device_info *devices_info = NULL; 3687 struct btrfs_device_info *devices_info = NULL;
3629 u64 total_avail; 3688 u64 total_avail;
3630 int num_stripes; /* total number of stripes to allocate */ 3689 int num_stripes; /* total number of stripes to allocate */
3690 int data_stripes; /* number of stripes that count for
3691 block group size */
3631 int sub_stripes; /* sub_stripes info for map */ 3692 int sub_stripes; /* sub_stripes info for map */
3632 int dev_stripes; /* stripes per dev */ 3693 int dev_stripes; /* stripes per dev */
3633 int devs_max; /* max devs to use */ 3694 int devs_max; /* max devs to use */
@@ -3639,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3639 u64 max_chunk_size; 3700 u64 max_chunk_size;
3640 u64 stripe_size; 3701 u64 stripe_size;
3641 u64 num_bytes; 3702 u64 num_bytes;
3703 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3642 int ndevs; 3704 int ndevs;
3643 int i; 3705 int i;
3644 int j; 3706 int j;
@@ -3768,16 +3830,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3768 stripe_size = devices_info[ndevs-1].max_avail; 3830 stripe_size = devices_info[ndevs-1].max_avail;
3769 num_stripes = ndevs * dev_stripes; 3831 num_stripes = ndevs * dev_stripes;
3770 3832
3833 /*
3834 * this will have to be fixed for RAID1 and RAID10 over
3835 * more drives
3836 */
3837 data_stripes = num_stripes / ncopies;
3838
3771 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3839 if (stripe_size * ndevs > max_chunk_size * ncopies) {
3772 stripe_size = max_chunk_size * ncopies; 3840 stripe_size = max_chunk_size * ncopies;
3773 do_div(stripe_size, ndevs); 3841 do_div(stripe_size, ndevs);
3774 } 3842 }
3775 3843 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3844 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3845 btrfs_super_stripesize(info->super_copy));
3846 data_stripes = num_stripes - 1;
3847 }
3848 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3849 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3850 btrfs_super_stripesize(info->super_copy));
3851 data_stripes = num_stripes - 2;
3852 }
3776 do_div(stripe_size, dev_stripes); 3853 do_div(stripe_size, dev_stripes);
3777 3854
3778 /* align to BTRFS_STRIPE_LEN */ 3855 /* align to BTRFS_STRIPE_LEN */
3779 do_div(stripe_size, BTRFS_STRIPE_LEN); 3856 do_div(stripe_size, raid_stripe_len);
3780 stripe_size *= BTRFS_STRIPE_LEN; 3857 stripe_size *= raid_stripe_len;
3781 3858
3782 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3859 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3783 if (!map) { 3860 if (!map) {
@@ -3795,14 +3872,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3795 } 3872 }
3796 } 3873 }
3797 map->sector_size = extent_root->sectorsize; 3874 map->sector_size = extent_root->sectorsize;
3798 map->stripe_len = BTRFS_STRIPE_LEN; 3875 map->stripe_len = raid_stripe_len;
3799 map->io_align = BTRFS_STRIPE_LEN; 3876 map->io_align = raid_stripe_len;
3800 map->io_width = BTRFS_STRIPE_LEN; 3877 map->io_width = raid_stripe_len;
3801 map->type = type; 3878 map->type = type;
3802 map->sub_stripes = sub_stripes; 3879 map->sub_stripes = sub_stripes;
3803 3880
3804 *map_ret = map; 3881 *map_ret = map;
3805 num_bytes = stripe_size * (num_stripes / ncopies); 3882 num_bytes = stripe_size * data_stripes;
3806 3883
3807 *stripe_size_out = stripe_size; 3884 *stripe_size_out = stripe_size;
3808 *num_bytes_out = num_bytes; 3885 *num_bytes_out = num_bytes;
@@ -3853,6 +3930,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3853 } 3930 }
3854 3931
3855 free_extent_map(em); 3932 free_extent_map(em);
3933 check_raid56_incompat_flag(extent_root->fs_info, type);
3934
3856 kfree(devices_info); 3935 kfree(devices_info);
3857 return 0; 3936 return 0;
3858 3937
@@ -4136,6 +4215,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4136 ret = map->num_stripes; 4215 ret = map->num_stripes;
4137 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4216 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4138 ret = map->sub_stripes; 4217 ret = map->sub_stripes;
4218 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4219 ret = 2;
4220 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4221 ret = 3;
4139 else 4222 else
4140 ret = 1; 4223 ret = 1;
4141 free_extent_map(em); 4224 free_extent_map(em);
@@ -4148,6 +4231,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4148 return ret; 4231 return ret;
4149} 4232}
4150 4233
4234unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4235 struct btrfs_mapping_tree *map_tree,
4236 u64 logical)
4237{
4238 struct extent_map *em;
4239 struct map_lookup *map;
4240 struct extent_map_tree *em_tree = &map_tree->map_tree;
4241 unsigned long len = root->sectorsize;
4242
4243 read_lock(&em_tree->lock);
4244 em = lookup_extent_mapping(em_tree, logical, len);
4245 read_unlock(&em_tree->lock);
4246 BUG_ON(!em);
4247
4248 BUG_ON(em->start > logical || em->start + em->len < logical);
4249 map = (struct map_lookup *)em->bdev;
4250 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4251 BTRFS_BLOCK_GROUP_RAID6)) {
4252 len = map->stripe_len * nr_data_stripes(map);
4253 }
4254 free_extent_map(em);
4255 return len;
4256}
4257
4258int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4259 u64 logical, u64 len, int mirror_num)
4260{
4261 struct extent_map *em;
4262 struct map_lookup *map;
4263 struct extent_map_tree *em_tree = &map_tree->map_tree;
4264 int ret = 0;
4265
4266 read_lock(&em_tree->lock);
4267 em = lookup_extent_mapping(em_tree, logical, len);
4268 read_unlock(&em_tree->lock);
4269 BUG_ON(!em);
4270
4271 BUG_ON(em->start > logical || em->start + em->len < logical);
4272 map = (struct map_lookup *)em->bdev;
4273 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4274 BTRFS_BLOCK_GROUP_RAID6))
4275 ret = 1;
4276 free_extent_map(em);
4277 return ret;
4278}
4279
4151static int find_live_mirror(struct btrfs_fs_info *fs_info, 4280static int find_live_mirror(struct btrfs_fs_info *fs_info,
4152 struct map_lookup *map, int first, int num, 4281 struct map_lookup *map, int first, int num,
4153 int optimal, int dev_replace_is_ongoing) 4282 int optimal, int dev_replace_is_ongoing)
@@ -4185,10 +4314,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4185 return optimal; 4314 return optimal;
4186} 4315}
4187 4316
4317static inline int parity_smaller(u64 a, u64 b)
4318{
4319 return a > b;
4320}
4321
4322/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4323static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4324{
4325 struct btrfs_bio_stripe s;
4326 int i;
4327 u64 l;
4328 int again = 1;
4329
4330 while (again) {
4331 again = 0;
4332 for (i = 0; i < bbio->num_stripes - 1; i++) {
4333 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4334 s = bbio->stripes[i];
4335 l = raid_map[i];
4336 bbio->stripes[i] = bbio->stripes[i+1];
4337 raid_map[i] = raid_map[i+1];
4338 bbio->stripes[i+1] = s;
4339 raid_map[i+1] = l;
4340 again = 1;
4341 }
4342 }
4343 }
4344}
4345
4188static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4346static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4189 u64 logical, u64 *length, 4347 u64 logical, u64 *length,
4190 struct btrfs_bio **bbio_ret, 4348 struct btrfs_bio **bbio_ret,
4191 int mirror_num) 4349 int mirror_num, u64 **raid_map_ret)
4192{ 4350{
4193 struct extent_map *em; 4351 struct extent_map *em;
4194 struct map_lookup *map; 4352 struct map_lookup *map;
@@ -4200,6 +4358,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4200 u64 stripe_nr; 4358 u64 stripe_nr;
4201 u64 stripe_nr_orig; 4359 u64 stripe_nr_orig;
4202 u64 stripe_nr_end; 4360 u64 stripe_nr_end;
4361 u64 stripe_len;
4362 u64 *raid_map = NULL;
4203 int stripe_index; 4363 int stripe_index;
4204 int i; 4364 int i;
4205 int ret = 0; 4365 int ret = 0;
@@ -4211,6 +4371,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4211 int num_alloc_stripes; 4371 int num_alloc_stripes;
4212 int patch_the_first_stripe_for_dev_replace = 0; 4372 int patch_the_first_stripe_for_dev_replace = 0;
4213 u64 physical_to_patch_in_first_stripe = 0; 4373 u64 physical_to_patch_in_first_stripe = 0;
4374 u64 raid56_full_stripe_start = (u64)-1;
4214 4375
4215 read_lock(&em_tree->lock); 4376 read_lock(&em_tree->lock);
4216 em = lookup_extent_mapping(em_tree, logical, *length); 4377 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4227,29 +4388,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4227 map = (struct map_lookup *)em->bdev; 4388 map = (struct map_lookup *)em->bdev;
4228 offset = logical - em->start; 4389 offset = logical - em->start;
4229 4390
4391 if (mirror_num > map->num_stripes)
4392 mirror_num = 0;
4393
4394 stripe_len = map->stripe_len;
4230 stripe_nr = offset; 4395 stripe_nr = offset;
4231 /* 4396 /*
4232 * stripe_nr counts the total number of stripes we have to stride 4397 * stripe_nr counts the total number of stripes we have to stride
4233 * to get to this block 4398 * to get to this block
4234 */ 4399 */
4235 do_div(stripe_nr, map->stripe_len); 4400 do_div(stripe_nr, stripe_len);
4236 4401
4237 stripe_offset = stripe_nr * map->stripe_len; 4402 stripe_offset = stripe_nr * stripe_len;
4238 BUG_ON(offset < stripe_offset); 4403 BUG_ON(offset < stripe_offset);
4239 4404
4240 /* stripe_offset is the offset of this block in its stripe*/ 4405 /* stripe_offset is the offset of this block in its stripe*/
4241 stripe_offset = offset - stripe_offset; 4406 stripe_offset = offset - stripe_offset;
4242 4407
4243 if (rw & REQ_DISCARD) 4408 /* if we're here for raid56, we need to know the stripe aligned start */
4409 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4410 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4411 raid56_full_stripe_start = offset;
4412
4413 /* allow a write of a full stripe, but make sure we don't
4414 * allow straddling of stripes
4415 */
4416 do_div(raid56_full_stripe_start, full_stripe_len);
4417 raid56_full_stripe_start *= full_stripe_len;
4418 }
4419
4420 if (rw & REQ_DISCARD) {
4421 /* we don't discard raid56 yet */
4422 if (map->type &
4423 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4424 ret = -EOPNOTSUPP;
4425 goto out;
4426 }
4244 *length = min_t(u64, em->len - offset, *length); 4427 *length = min_t(u64, em->len - offset, *length);
4245 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4428 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4246 /* we limit the length of each bio to what fits in a stripe */ 4429 u64 max_len;
4247 *length = min_t(u64, em->len - offset, 4430 /* For writes to RAID[56], allow a full stripeset across all disks.
4248 map->stripe_len - stripe_offset); 4431 For other RAID types and for RAID[56] reads, just allow a single
4432 stripe (on a single disk). */
4433 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4434 (rw & REQ_WRITE)) {
4435 max_len = stripe_len * nr_data_stripes(map) -
4436 (offset - raid56_full_stripe_start);
4437 } else {
4438 /* we limit the length of each bio to what fits in a stripe */
4439 max_len = stripe_len - stripe_offset;
4440 }
4441 *length = min_t(u64, em->len - offset, max_len);
4249 } else { 4442 } else {
4250 *length = em->len - offset; 4443 *length = em->len - offset;
4251 } 4444 }
4252 4445
4446 /* This is for when we're called from btrfs_merge_bio_hook() and all
4447 it cares about is the length */
4253 if (!bbio_ret) 4448 if (!bbio_ret)
4254 goto out; 4449 goto out;
4255 4450
@@ -4282,7 +4477,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4282 u64 physical_of_found = 0; 4477 u64 physical_of_found = 0;
4283 4478
4284 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4479 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4285 logical, &tmp_length, &tmp_bbio, 0); 4480 logical, &tmp_length, &tmp_bbio, 0, NULL);
4286 if (ret) { 4481 if (ret) {
4287 WARN_ON(tmp_bbio != NULL); 4482 WARN_ON(tmp_bbio != NULL);
4288 goto out; 4483 goto out;
@@ -4348,6 +4543,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4348 do_div(stripe_nr_end, map->stripe_len); 4543 do_div(stripe_nr_end, map->stripe_len);
4349 stripe_end_offset = stripe_nr_end * map->stripe_len - 4544 stripe_end_offset = stripe_nr_end * map->stripe_len -
4350 (offset + *length); 4545 (offset + *length);
4546
4351 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4547 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4352 if (rw & REQ_DISCARD) 4548 if (rw & REQ_DISCARD)
4353 num_stripes = min_t(u64, map->num_stripes, 4549 num_stripes = min_t(u64, map->num_stripes,
@@ -4398,6 +4594,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4398 dev_replace_is_ongoing); 4594 dev_replace_is_ongoing);
4399 mirror_num = stripe_index - old_stripe_index + 1; 4595 mirror_num = stripe_index - old_stripe_index + 1;
4400 } 4596 }
4597
4598 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4599 BTRFS_BLOCK_GROUP_RAID6)) {
4600 u64 tmp;
4601
4602 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4603 && raid_map_ret) {
4604 int i, rot;
4605
4606 /* push stripe_nr back to the start of the full stripe */
4607 stripe_nr = raid56_full_stripe_start;
4608 do_div(stripe_nr, stripe_len);
4609
4610 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4611
4612 /* RAID[56] write or recovery. Return all stripes */
4613 num_stripes = map->num_stripes;
4614 max_errors = nr_parity_stripes(map);
4615
4616 raid_map = kmalloc(sizeof(u64) * num_stripes,
4617 GFP_NOFS);
4618 if (!raid_map) {
4619 ret = -ENOMEM;
4620 goto out;
4621 }
4622
4623 /* Work out the disk rotation on this stripe-set */
4624 tmp = stripe_nr;
4625 rot = do_div(tmp, num_stripes);
4626
4627 /* Fill in the logical address of each stripe */
4628 tmp = stripe_nr * nr_data_stripes(map);
4629 for (i = 0; i < nr_data_stripes(map); i++)
4630 raid_map[(i+rot) % num_stripes] =
4631 em->start + (tmp + i) * map->stripe_len;
4632
4633 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4634 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4635 raid_map[(i+rot+1) % num_stripes] =
4636 RAID6_Q_STRIPE;
4637
4638 *length = map->stripe_len;
4639 stripe_index = 0;
4640 stripe_offset = 0;
4641 } else {
4642 /*
4643 * Mirror #0 or #1 means the original data block.
4644 * Mirror #2 is RAID5 parity block.
4645 * Mirror #3 is RAID6 Q block.
4646 */
4647 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4648 if (mirror_num > 1)
4649 stripe_index = nr_data_stripes(map) +
4650 mirror_num - 2;
4651
4652 /* We distribute the parity blocks across stripes */
4653 tmp = stripe_nr + stripe_index;
4654 stripe_index = do_div(tmp, map->num_stripes);
4655 }
4401 } else { 4656 } else {
4402 /* 4657 /*
4403 * after this do_div call, stripe_nr is the number of stripes 4658 * after this do_div call, stripe_nr is the number of stripes
@@ -4506,8 +4761,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4506 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4761 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4507 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4762 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4508 BTRFS_BLOCK_GROUP_RAID10 | 4763 BTRFS_BLOCK_GROUP_RAID10 |
4764 BTRFS_BLOCK_GROUP_RAID5 |
4509 BTRFS_BLOCK_GROUP_DUP)) { 4765 BTRFS_BLOCK_GROUP_DUP)) {
4510 max_errors = 1; 4766 max_errors = 1;
4767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4768 max_errors = 2;
4511 } 4769 }
4512 } 4770 }
4513 4771
@@ -4608,6 +4866,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4608 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4866 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4609 bbio->mirror_num = map->num_stripes + 1; 4867 bbio->mirror_num = map->num_stripes + 1;
4610 } 4868 }
4869 if (raid_map) {
4870 sort_parity_stripes(bbio, raid_map);
4871 *raid_map_ret = raid_map;
4872 }
4611out: 4873out:
4612 if (dev_replace_is_ongoing) 4874 if (dev_replace_is_ongoing)
4613 btrfs_dev_replace_unlock(dev_replace); 4875 btrfs_dev_replace_unlock(dev_replace);
@@ -4620,7 +4882,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4620 struct btrfs_bio **bbio_ret, int mirror_num) 4882 struct btrfs_bio **bbio_ret, int mirror_num)
4621{ 4883{
4622 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4884 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4623 mirror_num); 4885 mirror_num, NULL);
4624} 4886}
4625 4887
4626int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4888int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4634,6 +4896,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4634 u64 bytenr; 4896 u64 bytenr;
4635 u64 length; 4897 u64 length;
4636 u64 stripe_nr; 4898 u64 stripe_nr;
4899 u64 rmap_len;
4637 int i, j, nr = 0; 4900 int i, j, nr = 0;
4638 4901
4639 read_lock(&em_tree->lock); 4902 read_lock(&em_tree->lock);
@@ -4644,10 +4907,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4644 map = (struct map_lookup *)em->bdev; 4907 map = (struct map_lookup *)em->bdev;
4645 4908
4646 length = em->len; 4909 length = em->len;
4910 rmap_len = map->stripe_len;
4911
4647 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4912 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4648 do_div(length, map->num_stripes / map->sub_stripes); 4913 do_div(length, map->num_stripes / map->sub_stripes);
4649 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4914 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4650 do_div(length, map->num_stripes); 4915 do_div(length, map->num_stripes);
4916 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4917 BTRFS_BLOCK_GROUP_RAID6)) {
4918 do_div(length, nr_data_stripes(map));
4919 rmap_len = map->stripe_len * nr_data_stripes(map);
4920 }
4651 4921
4652 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4922 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4653 BUG_ON(!buf); /* -ENOMEM */ 4923 BUG_ON(!buf); /* -ENOMEM */
@@ -4667,8 +4937,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4667 do_div(stripe_nr, map->sub_stripes); 4937 do_div(stripe_nr, map->sub_stripes);
4668 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4938 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4669 stripe_nr = stripe_nr * map->num_stripes + i; 4939 stripe_nr = stripe_nr * map->num_stripes + i;
4670 } 4940 } /* else if RAID[56], multiply by nr_data_stripes().
4671 bytenr = chunk_start + stripe_nr * map->stripe_len; 4941 * Alternatively, just use rmap_len below instead of
4942 * map->stripe_len */
4943
4944 bytenr = chunk_start + stripe_nr * rmap_len;
4672 WARN_ON(nr >= map->num_stripes); 4945 WARN_ON(nr >= map->num_stripes);
4673 for (j = 0; j < nr; j++) { 4946 for (j = 0; j < nr; j++) {
4674 if (buf[j] == bytenr) 4947 if (buf[j] == bytenr)
@@ -4682,7 +4955,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4682 4955
4683 *logical = buf; 4956 *logical = buf;
4684 *naddrs = nr; 4957 *naddrs = nr;
4685 *stripe_len = map->stripe_len; 4958 *stripe_len = rmap_len;
4686 4959
4687 free_extent_map(em); 4960 free_extent_map(em);
4688 return 0; 4961 return 0;
@@ -4756,7 +5029,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4756 bio->bi_bdev = (struct block_device *) 5029 bio->bi_bdev = (struct block_device *)
4757 (unsigned long)bbio->mirror_num; 5030 (unsigned long)bbio->mirror_num;
4758 /* only send an error to the higher layers if it is 5031 /* only send an error to the higher layers if it is
4759 * beyond the tolerance of the multi-bio 5032 * beyond the tolerance of the btrfs bio
4760 */ 5033 */
4761 if (atomic_read(&bbio->error) > bbio->max_errors) { 5034 if (atomic_read(&bbio->error) > bbio->max_errors) {
4762 err = -EIO; 5035 err = -EIO;
@@ -4790,13 +5063,18 @@ struct async_sched {
4790 * This will add one bio to the pending list for a device and make sure 5063 * This will add one bio to the pending list for a device and make sure
4791 * the work struct is scheduled. 5064 * the work struct is scheduled.
4792 */ 5065 */
4793static noinline void schedule_bio(struct btrfs_root *root, 5066noinline void btrfs_schedule_bio(struct btrfs_root *root,
4794 struct btrfs_device *device, 5067 struct btrfs_device *device,
4795 int rw, struct bio *bio) 5068 int rw, struct bio *bio)
4796{ 5069{
4797 int should_queue = 1; 5070 int should_queue = 1;
4798 struct btrfs_pending_bios *pending_bios; 5071 struct btrfs_pending_bios *pending_bios;
4799 5072
5073 if (device->missing || !device->bdev) {
5074 bio_endio(bio, -EIO);
5075 return;
5076 }
5077
4800 /* don't bother with additional async steps for reads, right now */ 5078 /* don't bother with additional async steps for reads, right now */
4801 if (!(rw & REQ_WRITE)) { 5079 if (!(rw & REQ_WRITE)) {
4802 bio_get(bio); 5080 bio_get(bio);
@@ -4894,7 +5172,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4894#endif 5172#endif
4895 bio->bi_bdev = dev->bdev; 5173 bio->bi_bdev = dev->bdev;
4896 if (async) 5174 if (async)
4897 schedule_bio(root, dev, rw, bio); 5175 btrfs_schedule_bio(root, dev, rw, bio);
4898 else 5176 else
4899 btrfsic_submit_bio(rw, bio); 5177 btrfsic_submit_bio(rw, bio);
4900} 5178}
@@ -4953,6 +5231,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4953 u64 logical = (u64)bio->bi_sector << 9; 5231 u64 logical = (u64)bio->bi_sector << 9;
4954 u64 length = 0; 5232 u64 length = 0;
4955 u64 map_length; 5233 u64 map_length;
5234 u64 *raid_map = NULL;
4956 int ret; 5235 int ret;
4957 int dev_nr = 0; 5236 int dev_nr = 0;
4958 int total_devs = 1; 5237 int total_devs = 1;
@@ -4961,12 +5240,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4961 length = bio->bi_size; 5240 length = bio->bi_size;
4962 map_length = length; 5241 map_length = length;
4963 5242
4964 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5243 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4965 mirror_num); 5244 mirror_num, &raid_map);
4966 if (ret) 5245 if (ret) /* -ENOMEM */
4967 return ret; 5246 return ret;
4968 5247
4969 total_devs = bbio->num_stripes; 5248 total_devs = bbio->num_stripes;
5249 bbio->orig_bio = first_bio;
5250 bbio->private = first_bio->bi_private;
5251 bbio->end_io = first_bio->bi_end_io;
5252 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5253
5254 if (raid_map) {
5255 /* In this case, map_length has been set to the length of
5256 a single stripe; not the whole write */
5257 if (rw & WRITE) {
5258 return raid56_parity_write(root, bio, bbio,
5259 raid_map, map_length);
5260 } else {
5261 return raid56_parity_recover(root, bio, bbio,
5262 raid_map, map_length,
5263 mirror_num);
5264 }
5265 }
5266
4970 if (map_length < length) { 5267 if (map_length < length) {
4971 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5268 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4972 "len %llu\n", (unsigned long long)logical, 5269 "len %llu\n", (unsigned long long)logical,
@@ -4975,11 +5272,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4975 BUG(); 5272 BUG();
4976 } 5273 }
4977 5274
4978 bbio->orig_bio = first_bio;
4979 bbio->private = first_bio->bi_private;
4980 bbio->end_io = first_bio->bi_end_io;
4981 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4982
4983 while (dev_nr < total_devs) { 5275 while (dev_nr < total_devs) {
4984 dev = bbio->stripes[dev_nr].dev; 5276 dev = bbio->stripes[dev_nr].dev;
4985 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5277 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {