Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--  fs/btrfs/volumes.c | 636
1 file changed, 533 insertions(+), 103 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b1672..35bb2d4ed29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -32,6 +34,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
 		new_device->can_discard = 0;
+		spin_lock_init(&new_device->io_lock);
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);

 		call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }

+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
 	struct block_device *bdev;
-	struct buffer_head *bh;
-	int ret;
+	struct page *page;
+	void *p;
+	int ret = -EINVAL;
 	u64 devid;
 	u64 transid;
 	u64 total_devices;
+	u64 bytenr;
+	pgoff_t index;

+	/*
+	 * we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	bytenr = btrfs_sb_offset(0);
 	flags |= FMODE_EXCL;
 	mutex_lock(&uuid_mutex);
-	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
-	if (ret)
+
+	bdev = blkdev_get_by_path(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
 		goto error;
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	}
+
+	/* make sure our super fits in the device */
+	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+		goto error_bdev_put;
+
+	/* make sure our super fits in the page */
+	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+		goto error_bdev_put;
+
+	/* make sure our super doesn't straddle pages on disk */
+	index = bytenr >> PAGE_CACHE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+		goto error_bdev_put;
+
+	/* pull in the page with our super */
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   index, GFP_NOFS);
+
+	if (IS_ERR_OR_NULL(page))
+		goto error_bdev_put;
+
+	p = kmap(page);
+
+	/* align our pointer to the offset of the super block */
+	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+		goto error_unmap;
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
 	total_devices = btrfs_super_num_devices(disk_super);
+
 	if (disk_super->label[0]) {
 		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	} else {
 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 	}
+
 	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
+
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 	if (!ret && fs_devices_ret)
 		(*fs_devices_ret)->total_devices = total_devices;
-	brelse(bh);
+
+error_unmap:
+	kunmap(page);
+	page_cache_release(page);
+
+error_bdev_put:
 	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
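The three page checks in the hunk above guarantee that the kmap()'d superblock lies entirely inside a single pagecache page. A standalone userspace sketch of the same arithmetic, assuming the primary super at 64KiB (what btrfs_sb_offset(0) returns) and 4KiB pages:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SIZE	4096ULL
	#define PAGE_SHIFT	12
	#define SUPER_SIZE	4096ULL	/* assumed sizeof(struct btrfs_super_block) */

	int main(void)
	{
		uint64_t bytenr = 64 * 1024;		/* primary super offset */
		uint64_t index = bytenr >> PAGE_SHIFT;	/* pagecache index 16 */

		/* the straddle test from the patch: first and last byte share a page */
		if ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT != index)
			printf("super straddles a page boundary\n");
		else
			printf("super in page %llu at offset %llu\n",
			       (unsigned long long)index,
			       (unsigned long long)(bytenr & (PAGE_SIZE - 1)));
		return 0;
	}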
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;

 	mutex_lock(&uuid_mutex);

-	all_avail = root->fs_info->avail_data_alloc_bits |
-		root->fs_info->avail_system_alloc_bits |
-		root->fs_info->avail_metadata_alloc_bits;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));

 	num_devices = root->fs_info->fs_devices->num_devices;
 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
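The do/while above is the seqlock read pattern: sample all three avail_*_alloc_bits words without blocking writers, and retry if a profile update raced with the read. A minimal userspace sketch of the read side, using C11 atomics in place of the kernel's seqlock_t (illustrative, not the kernel implementation):

	#include <stdatomic.h>

	struct seqlock { atomic_uint seq; };	/* writers hold seq odd while updating */

	static unsigned read_seqbegin(struct seqlock *sl)
	{
		unsigned s;
		do {
			s = atomic_load_explicit(&sl->seq, memory_order_acquire);
		} while (s & 1);	/* wait out an in-progress write */
		return s;
	}

	static int read_seqretry(struct seqlock *sl, unsigned start)
	{
		atomic_thread_fence(memory_order_acquire);
		return atomic_load_explicit(&sl->seq, memory_order_relaxed) != start;
	}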
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto out;
 	}

+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid5\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		printk(KERN_ERR "btrfs: unable to go below three "
+		       "devices on raid6\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (strcmp(device_path, "missing") == 0) {
 		struct list_head *devices;
 		struct btrfs_device *tmp;
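Concretely: with RAID5 block groups on two writable devices the array is already at its one-data-plus-one-parity minimum, so the removal is refused; RAID6 carries a second syndrome stripe and therefore refuses at three.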
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	chunk_used = btrfs_block_group_used(&cache->item);

 	if (bargs->usage == 0)
-		user_thresh = 0;
+		user_thresh = 1;
 	else if (bargs->usage > 100)
 		user_thresh = cache->key.offset;
 	else
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 		return 0;

 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
-		factor = 2;
-	else
-		factor = 1;
-	factor = num_stripes / factor;
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}

 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
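Here factor is the number of device-sized slices the chunk's data spans, so the stripe length computed from it is bytes per device. A self-contained sketch of the same selection, with illustrative flag values standing in for the real BTRFS_BLOCK_GROUP_* bits:

	#include <stdio.h>

	#define RAID0	(1u << 0)	/* illustrative values; the real */
	#define RAID1	(1u << 1)	/* flags live in ctree.h */
	#define DUP	(1u << 2)
	#define RAID10	(1u << 3)
	#define RAID5	(1u << 4)
	#define RAID6	(1u << 5)

	static int drange_factor(unsigned type, int num_stripes)
	{
		if (type & (DUP | RAID1 | RAID10))
			return num_stripes / 2;	/* two copies of everything */
		if (type & RAID5)
			return num_stripes - 1;	/* one parity stripe */
		if (type & RAID6)
			return num_stripes - 2;	/* two syndrome stripes */
		return num_stripes;		/* raid0 / single */
	}

	int main(void)
	{
		/* a 6-stripe chunk: raid10 -> 3, raid5 -> 5, raid6 -> 4, raid0 -> 6 */
		printf("%d %d %d %d\n",
		       drange_factor(RAID10, 6), drange_factor(RAID5, 6),
		       drange_factor(RAID6, 6), drange_factor(RAID0, 6));
		return 0;
	}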
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	unsigned seq;

 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
 	else
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			    BTRFS_BLOCK_GROUP_RAID10);
+			    BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID5 |
+			    BTRFS_BLOCK_GROUP_RAID6);

 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,

 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
-	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_system_alloc_bits & allowed) &&
-	     !(bctl->sys.target & allowed)) ||
-	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_metadata_alloc_bits & allowed) &&
-	     !(bctl->meta.target & allowed))) {
-		if (bctl->flags & BTRFS_BALANCE_FORCE) {
-			printk(KERN_INFO "btrfs: force reducing metadata "
-			       "integrity\n");
-		} else {
-			printk(KERN_ERR "btrfs: balance will reduce metadata "
-			       "integrity, use force if you want this\n");
-			ret = -EINVAL;
-			goto out;
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
 		}
-	}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));

 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		int num_tolerated_disk_barrier_failures;
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	mutex_lock(&fs_info->balance_mutex);
 	atomic_dec(&fs_info->balance_running);

-	if (bargs) {
-		memset(bargs, 0, sizeof(*bargs));
-		update_ioctl_balance_args(fs_info, 0, bargs);
-	}
-
-	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
-	    balance_need_close(fs_info)) {
-		__cancel_balance(fs_info);
-	}
-
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		fs_info->num_tolerated_disk_barrier_failures =
 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	}

+	if (bargs) {
+		memset(bargs, 0, sizeof(*bargs));
+		update_ioctl_balance_args(fs_info, 0, bargs);
+	}
+
 	wake_up(&fs_info->balance_wait_q);

 	return ret;
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 }

 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
-	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
-	{ 1, 2, 1, 1, 1, 2 /* dup */ },
-	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
-	{ 1, 1, 1, 1, 1, 1 /* single */ },
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes = 2,
+		.dev_stripes = 1,
+		.devs_max = 0,	/* 0 == as many as possible */
+		.devs_min = 4,
+		.devs_increment = 2,
+		.ncopies = 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 2,
+		.devs_min = 2,
+		.devs_increment = 2,
+		.ncopies = 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes = 1,
+		.dev_stripes = 2,
+		.devs_max = 1,
+		.devs_min = 1,
+		.devs_increment = 1,
+		.ncopies = 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 0,
+		.devs_min = 2,
+		.devs_increment = 1,
+		.ncopies = 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 1,
+		.devs_min = 1,
+		.devs_increment = 1,
+		.ncopies = 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 0,
+		.devs_min = 2,
+		.devs_increment = 1,
+		.ncopies = 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes = 1,
+		.dev_stripes = 1,
+		.devs_max = 0,
+		.devs_min = 3,
+		.devs_increment = 1,
+		.ncopies = 3,
+	},
 };

+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	u64 features;
+
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	features = btrfs_super_incompat_flags(info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+		return;
+
+	features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+	btrfs_set_super_incompat_flags(info->super_copy, features);
+	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root,
 			       struct map_lookup **map_ret,
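The designated initializers replace the old positional rows such as { 2, 1, 0, 4, 2, 2 }, so each attribute is named where it is set. A minimal sketch of how an allocator can consult a table like this to trim a candidate device count to something the profile accepts (illustrative helper, not code from this patch):

	struct raid_attr { int devs_increment, devs_min, devs_max; };

	static int legal_ndevs(const struct raid_attr *a, int ndevs)
	{
		ndevs -= ndevs % a->devs_increment;	/* e.g. whole pairs for raid10 */
		if (ndevs < a->devs_min)
			return 0;			/* profile not possible */
		if (a->devs_max && ndevs > a->devs_max)
			ndevs = a->devs_max;		/* 0 == as many as possible */
		return ndevs;
	}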
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
 			continue;

+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
 		devices_info[ndevs].dev_offset = dev_offset;
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
 		++ndevs;
-		WARN_ON(ndevs > fs_devices->rw_devices);
 	}

 	/*
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;

-	if (stripe_size * ndevs > max_chunk_size * ncopies) {
-		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, ndevs);
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+		stripe_size = max_chunk_size;
+		do_div(stripe_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
 	}

 	do_div(stripe_size, dev_stripes);

 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;

 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
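A worked instance of the clamp above, assuming a 4-device RAID5 chunk (data_stripes = 3) and a 10GiB max_chunk_size; mask rounds the per-device answer up to a 16MiB boundary:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t max_chunk_size = 10ULL << 30;	/* assumed 10GiB limit */
		uint64_t mask = (1ULL << 24) - 1;
		uint64_t stripe_size = max_chunk_size / 3;	/* do_div() by data_stripes */

		stripe_size = (stripe_size + mask) & ~mask;	/* up to 16MiB boundary */
		printf("per-device stripe: %llu MiB\n",
		       (unsigned long long)(stripe_size >> 20));	/* prints 3424 */
		return 0;
	}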
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;

 	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
+	num_bytes = stripe_size * data_stripes;

 	*stripe_size_out = stripe_size;
 	*num_bytes_out = num_bytes;
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	write_unlock(&em_tree->lock);
-	free_extent_map(em);
-	if (ret)
-		goto error;
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, type,
-				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				     start, num_bytes);
-	if (ret)
+	if (ret) {
+		free_extent_map(em);
 		goto error;
+	}

 	for (i = 0; i < map->num_stripes; ++i) {
 		struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 				info->chunk_root->root_key.objectid,
 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				start, dev_offset, stripe_size);
-		if (ret) {
-			btrfs_abort_transaction(trans, extent_root, ret);
-			goto error;
-		}
+		if (ret)
+			goto error_dev_extent;
+	}
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, num_bytes);
+	if (ret) {
+		i = map->num_stripes - 1;
+		goto error_dev_extent;
 	}

+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
 	kfree(devices_info);
 	return 0;

+error_dev_extent:
+	for (; i >= 0; i--) {
+		struct btrfs_device *device;
+		int err;
+
+		device = map->stripes[i].dev;
+		err = btrfs_free_dev_extent(trans, device, start);
+		if (err) {
+			btrfs_abort_transaction(trans, extent_root, err);
+			break;
+		}
+	}
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;

-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
 				  &stripe_size, chunk_offset, alloc_profile);
 	if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,

 	sys_chunk_offset = chunk_offset + chunk_size;

-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
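The counts for the new profiles are ways to obtain a block rather than literal copies: a RAID5 block can be read in place or rebuilt from the remaining data plus parity (2), and RAID6's Q syndrome adds a third independent path (3).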
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }

+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }

+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
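sort_parity_stripes() keys on raid_map, where each data stripe holds its logical address and the P/Q slots hold the RAID5_P_STRIPE/RAID6_Q_STRIPE sentinels; assuming the sentinels are the two largest u64 values, an ascending sort necessarily pushes parity and syndrome to the tail. A standalone sketch of that effect:

	#include <stdio.h>
	#include <stdint.h>

	#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
	#define RAID6_Q_STRIPE ((uint64_t)-1)

	int main(void)
	{
		/* a rotated 4-disk raid6 stripe-set: two data stripes plus P and Q */
		uint64_t raid_map[4] = { RAID6_Q_STRIPE, 65536, RAID5_P_STRIPE, 0 };
		int i, j;

		for (i = 0; i < 3; i++)		/* same bubble-sort shape as the patch */
			for (j = 0; j < 3 - i; j++)
				if (raid_map[j] > raid_map[j + 1]) {
					uint64_t t = raid_map[j];
					raid_map[j] = raid_map[j + 1];
					raid_map[j + 1] = t;
				}
		for (i = 0; i < 4; i++)		/* prints 0, 65536, P, Q */
			printf("%llu\n", (unsigned long long)raid_map[i]);
		return 0;
	}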
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;

 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;

+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);

-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);

 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;

-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}

+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;

@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;

 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-					logical, &tmp_length, &tmp_bbio, 0);
+					logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+					       mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
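A worked instance of the rotation arithmetic above, assuming a 5-disk RAID5 chunk (nr_data_stripes = 4), the default 64KiB stripe_len, and a logical offset 1MiB into the chunk:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t stripe_len = 64 * 1024, nr_data = 4, num_stripes = 5;
		uint64_t offset = 1024 * 1024;		/* logical offset in chunk */

		uint64_t full_len = stripe_len * nr_data;		/* 256KiB */
		uint64_t full_start = (offset / full_len) * full_len;	/* 1MiB   */
		uint64_t stripe_nr = (full_start / stripe_len) / nr_data; /* 4    */
		uint64_t rot = stripe_nr % num_stripes;			/* 4      */

		/* data stripe i lands on disk (i + rot) % num_stripes, so parity
		 * rotates one disk further on each successive stripe-set */
		printf("parity on disk %llu\n",
		       (unsigned long long)((nr_data + rot) % num_stripes));
		return 0;
	}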
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}

@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }

 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;

 	read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = (struct map_lookup *)em->bdev;

 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}

 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,

 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;

 	free_extent_map(em);
 	return 0;
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio->bi_bdev = (struct block_device *)
 					(unsigned long)bbio->mirror_num;
 		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the multi-bio
+		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
-				  struct btrfs_device *device,
-				  int rw, struct bio *bio)
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;

+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;

-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+				mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;

 	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
 	if (map_length < length) {
 		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}

-	bbio->orig_bio = first_bio;
-	bbio->private = first_bio->bi_private;
-	bbio->end_io = first_bio->bi_end_io;
-	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {