author		David Woodhouse <David.Woodhouse@intel.com>	2013-01-29 18:40:14 -0500
committer	Chris Mason <chris.mason@fusionio.com>		2013-02-01 14:24:23 -0500
commit		53b381b3abeb86f12787a6c40fee9b2f71edc23b (patch)
tree		c1018ba2157778f0200d2ede0c0df48fe5df8f14 /fs/btrfs/volumes.c
parent		64a167011bcabc1e855658387c8a4464b71f3138 (diff)
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem
have prepared a given bio. This means the higher layers are not
responsible for building full stripes, and they don't need to query
for the topology of the extents that may get allocated during delayed
allocation runs. It also means different files can easily share the
same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle. This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.

Discard does not work on raid5/6 (yet).

The stripe size is fixed at 64KiB per disk. This will be tunable in a
later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--	fs/btrfs/volumes.c	385
1 file changed, 344 insertions(+), 41 deletions(-)
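Editor's note: the commit message fixes the per-disk stripe at 64KiB, so a
RAID5/6 "full stripe" spans stripe_len times the number of data disks, and
writes get aligned back to that boundary (see the raid56_full_stripe_start
arithmetic in __btrfs_map_block below). A minimal userspace sketch of that
math, with the disk counts chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;	/* fixed 64KiB per disk */
	int num_disks = 4, parity_disks = 1;	/* RAID5 on four disks */
	uint64_t full_stripe = stripe_len * (num_disks - parity_disks);

	uint64_t offset = 300 * 1024;		/* logical offset into chunk */
	uint64_t full_stripe_start = (offset / full_stripe) * full_stripe;

	/* 192KiB full stripe; a 300KiB offset rounds back to 192KiB */
	printf("full stripe %llu KiB, aligned start %llu KiB\n",
	       (unsigned long long)(full_stripe / 1024),
	       (unsigned long long)(full_stripe_start / 1024));
	return 0;
}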
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 485a5423e3c6..c372264b85bf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -1389,6 +1392,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
 
+	if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 |
+			  BTRFS_BLOCK_GROUP_RAID6)) && num_devices <= 3) {
+		printk(KERN_ERR "btrfs: unable to go below three devices "
+		       "on raid5 or raid6\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
 		printk(KERN_ERR "btrfs: unable to go below four devices "
 		       "on raid10\n");
@@ -1403,6 +1414,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto out;
 	}
 
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid5\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		printk(KERN_ERR "btrfs: unable to go below three "
+		       "devices on raid6\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (strcmp(device_path, "missing") == 0) {
 		struct list_head *devices;
 		struct btrfs_device *tmp;
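Editor's note: taken together, the new checks encode per-profile minimum
rw-device counts. A condensed sketch of the rules (the enum and helper are
illustrative only; the patch keeps these as separate if-blocks as above):

enum raid_profile { RAID10, RAID6, RAID5, OTHER };

static int min_rw_devices(enum raid_profile p)
{
	switch (p) {
	case RAID10: return 4;	/* pre-existing check */
	case RAID6:  return 3;	/* one data stripe plus P and Q */
	case RAID5:  return 2;	/* one data stripe plus parity */
	default:     return 1;
	}
}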
@@ -2657,11 +2683,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 		return 0;
 
 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
-		factor = 2;
-	else
-		factor = 1;
-	factor = num_stripes / factor;
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
 
 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
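Editor's note: chunk_drange_filter() divides the chunk's total length by
this factor to recover the per-device stripe length. A worked example,
assuming a hypothetical chunk striped across six devices:

#include <stdio.h>

int main(void)
{
	int num_stripes = 6;	/* stripes recorded in the chunk item */

	/* per-device stripe length = chunk_length / factor */
	printf("raid1/raid10/dup: factor = %d\n", num_stripes / 2);	/* 3 */
	printf("raid5:            factor = %d\n", num_stripes - 1);	/* 5 */
	printf("raid6:            factor = %d\n", num_stripes - 2);	/* 4 */
	printf("raid0/single:     factor = %d\n", num_stripes);	/* 6 */
	return 0;
}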
@@ -2976,6 +3006,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	int cancel = 0;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3018,7 +3049,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
 	else
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			    BTRFS_BLOCK_GROUP_RAID10);
+			    BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID5 |
+			    BTRFS_BLOCK_GROUP_RAID6);
 
 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3058,7 +3091,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+
 	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	     (fs_info->avail_system_alloc_bits & allowed) &&
 	     !(bctl->sys.target & allowed)) ||
@@ -3124,15 +3160,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	}
 
 	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
-	    balance_need_close(fs_info)) {
-		__cancel_balance(fs_info);
-	}
+	    balance_need_close(fs_info))
+		cancel = 1;
 
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		fs_info->num_tolerated_disk_barrier_failures =
 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	}
 
+	if (cancel)
+		__cancel_balance(fs_info);
+
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
@@ -3493,13 +3531,45 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 }
 
 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+	/*
+	 * sub_stripes info for map,
+	 * dev_stripes -- stripes per dev, 2 for DUP, 1 otherwise
+	 * devs_max -- max devices per stripe, 0 for unlimited
+	 * devs_min -- min devices per stripe
+	 * devs_increment -- ndevs must be a multiple of this
+	 * ncopies -- how many copies of the data we have
+	 */
 	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
 	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
 	{ 1, 2, 1, 1, 1, 2 /* dup */ },
 	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
 	{ 1, 1, 0, 1, 1, 1 /* single */ },
+	{ 1, 1, 0, 2, 1, 2 /* raid5 */ },
+	{ 1, 1, 0, 3, 1, 3 /* raid6 */ },
 };
 
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	u64 features;
+
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	features = btrfs_super_incompat_flags(info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+		return;
+
+	features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+	btrfs_set_super_incompat_flags(info->super_copy, features);
+	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root,
 			       struct map_lookup **map_ret,
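Editor's note: the six columns initialize struct btrfs_raid_attr, which is
declared in volumes.h rather than in this file. A presumed reconstruction,
with the field order following the comment added above:

struct btrfs_raid_attr {
	int sub_stripes;	/* sub_stripes info for map */
	int dev_stripes;	/* stripes per dev */
	int devs_max;		/* max devs to use, 0 for unlimited */
	int devs_min;		/* min devs needed */
	int devs_increment;	/* ndevs must be a multiple of this */
	int ncopies;		/* how many copies of the data we have */
};

So the raid6 row reads: 1 sub-stripe, 1 stripe per device, unlimited
devices, at least 3 devices, any device count, and ncopies of 3 (the data
survives the loss of any two stripes).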
@@ -3515,6 +3585,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -3526,6 +3598,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -3651,16 +3724,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
 	if (stripe_size * ndevs > max_chunk_size * ncopies) {
 		stripe_size = max_chunk_size * ncopies;
 		do_div(stripe_size, ndevs);
 	}
-
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
 	do_div(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
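Editor's note: data_stripes is what converts the per-device stripe_size
into usable chunk capacity, since parity stripes consume raw space but add
nothing to the block group. A worked example of the resulting num_bytes
(computed as stripe_size * data_stripes in the next hunk), assuming RAID6
across six devices:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_size = 1ULL << 30;	/* 1GiB allocated per device */
	int num_stripes = 6;			/* six single-stripe devices */
	int data_stripes = num_stripes - 2;	/* RAID6: two parity stripes */

	/* six 1GiB device slices expose a 4GiB block group */
	printf("block group size: %llu GiB\n",
	       (unsigned long long)((stripe_size * data_stripes) >> 30));
	return 0;
}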
@@ -3678,14 +3766,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
 	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
+	num_bytes = stripe_size * data_stripes;
 
 	*stripe_size_out = stripe_size;
 	*num_bytes_out = num_bytes;
@@ -3734,6 +3822,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
 	kfree(devices_info);
 	return 0;
 
@@ -4003,6 +4093,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -4015,6 +4109,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
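Editor's note: both new helpers lean on nr_data_stripes(), which this file
never defines; presumably it comes from the new raid56.h alongside a parity
counterpart, roughly along these lines:

static inline int nr_parity_stripes(struct map_lookup *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;		/* P only */
	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;		/* P and Q */
	else
		return 0;
}

static inline int nr_data_stripes(struct map_lookup *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}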
@@ -4052,10 +4192,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }
 
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
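Editor's note: parity_smaller() is a plain ">", which suffices because the
parity placeholders used further down (RAID5_P_STRIPE, RAID6_Q_STRIPE)
appear to be the two largest u64 values, so real logical addresses sort
first, then P, then Q. A standalone sketch of that ordering (the two
defines are assumed, not quoted from the patch):

#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t raid_map[] = { RAID6_Q_STRIPE, 65536, RAID5_P_STRIPE, 0 };
	int n = 4, i, j;

	/* same bubble-sort idea as sort_parity_stripes() above */
	for (i = 0; i < n - 1; i++)
		for (j = 0; j < n - 1 - i; j++)
			if (raid_map[j] > raid_map[j + 1]) {
				uint64_t t = raid_map[j];
				raid_map[j] = raid_map[j + 1];
				raid_map[j + 1] = t;
			}

	/* prints the data addresses first, then P, then Q */
	for (i = 0; i < n; i++)
		printf("%llu\n", (unsigned long long)raid_map[i]);
	return 0;
}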
@@ -4067,6 +4236,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4078,6 +4249,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4094,29 +4266,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
@@ -4149,7 +4355,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;
 
 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-			     logical, &tmp_length, &tmp_bbio, 0);
+			     logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4215,6 +4421,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4265,6 +4472,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+						mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
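Editor's note: for a full-stripe write or recovery, the code above rewinds
stripe_nr to the start of the full stripe, derives the rotation, and
records which logical address (or parity role) each device slot carries.
A worked example for RAID5 on three disks, with RAID5_P_STRIPE assumed to
be (u64)-2:

#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel value */

int main(void)
{
	uint64_t em_start = 0, stripe_len = 64 * 1024;
	int num_stripes = 3, nr_data = 2;	/* RAID5: 2 data + P */
	uint64_t stripe_nr = 1;			/* the second full stripe */
	uint64_t raid_map[3], tmp = stripe_nr * nr_data;
	int i, rot = (int)(stripe_nr % num_stripes);

	for (i = 0; i < nr_data; i++)
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * stripe_len;
	raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;

	/* slot 0 holds parity this time: the parity disk rotates per stripe */
	for (i = 0; i < num_stripes; i++)
		printf("slot %d: %llu\n", i, (unsigned long long)raid_map[i]);
	return 0;
}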
@@ -4373,8 +4639,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}
 
@@ -4475,6 +4744,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4487,7 +4760,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4501,6 +4774,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
@@ -4511,10 +4785,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4534,8 +4815,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4549,7 +4833,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
@@ -4623,7 +4907,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		bio->bi_bdev = (struct block_device *)
 					(unsigned long)bbio->mirror_num;
 		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the multi-bio
+		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
@@ -4657,13 +4941,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
-				  struct btrfs_device *device,
-				  int rw, struct bio *bio)
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4761,7 +5050,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4820,6 +5109,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4828,12 +5118,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;
 
-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;
 
 	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
 	if (map_length < length) {
 		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -4842,11 +5150,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}
 
-	bbio->orig_bio = first_bio;
-	bbio->private = first_bio->bi_private;
-	bbio->end_io = first_bio->bi_end_io;
-	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {