Diffstat (limited to 'fs/btrfs/volumes.c')
-rw-r--r--	fs/btrfs/volumes.c	662
1 file changed, 557 insertions(+), 105 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa74012..6b9cff42265d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 	new_device->writeable = 0;
 	new_device->in_fs_metadata = 0;
 	new_device->can_discard = 0;
+	spin_lock_init(&new_device->io_lock);
 	list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
 	call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	return ret;
 }
 
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
 	struct block_device *bdev;
-	struct buffer_head *bh;
-	int ret;
+	struct page *page;
+	void *p;
+	int ret = -EINVAL;
 	u64 devid;
 	u64 transid;
 	u64 total_devices;
+	u64 bytenr;
+	pgoff_t index;
 
+	/*
+	 * we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	bytenr = btrfs_sb_offset(0);
 	flags |= FMODE_EXCL;
 	mutex_lock(&uuid_mutex);
-	ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh);
-	if (ret)
+
+	bdev = blkdev_get_by_path(path, flags, holder);
+
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
 		goto error;
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	}
+
+	/* make sure our super fits in the device */
+	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+		goto error_bdev_put;
+
+	/* make sure our super fits in the page */
+	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+		goto error_bdev_put;
+
+	/* make sure our super doesn't straddle pages on disk */
+	index = bytenr >> PAGE_CACHE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+		goto error_bdev_put;
+
+	/* pull in the page with our super */
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   index, GFP_NOFS);
+
+	if (IS_ERR_OR_NULL(page))
+		goto error_bdev_put;
+
+	p = kmap(page);
+
+	/* align our pointer to the offset of the super block */
+	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+		goto error_unmap;
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
 	total_devices = btrfs_super_num_devices(disk_super);
+
 	if (disk_super->label[0]) {
 		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
 			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	} else {
 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 	}
+
 	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
+
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 	if (!ret && fs_devices_ret)
 		(*fs_devices_ret)->total_devices = total_devices;
-	brelse(bh);
+
+error_unmap:
+	kunmap(page);
+	page_cache_release(page);
+
+error_bdev_put:
 	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
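
A note on the three guards above: reading the super through the pagecache only works if the whole superblock lands inside a single page, so the scan bails out when it would straddle a boundary. A minimal user-space sketch of the same arithmetic (page size and super offset are assumed values here, not taken from kernel headers):

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT	12
	#define SUPER_SIZE	4096	/* stand-in for sizeof(struct btrfs_super_block) */

	int main(void)
	{
		uint64_t bytenr = 65536;	/* first super block offset */
		uint64_t index = bytenr >> PAGE_SHIFT;

		/* same test as the patch: reject a super that straddles pages */
		if ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT != index)
			printf("super straddles a page boundary\n");
		else
			printf("super fits entirely in page %llu\n",
			       (unsigned long long)index);
		return 0;
	}
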
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 
-	all_avail = root->fs_info->avail_data_alloc_bits |
-		    root->fs_info->avail_system_alloc_bits |
-		    root->fs_info->avail_metadata_alloc_bits;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	num_devices = root->fs_info->fs_devices->num_devices;
 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
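
The do/while above is the standard seqlock read loop: all three avail_*_alloc_bits words are snapshotted together and the read is retried if a writer raced with it. A hedged sketch of the pairing as a kernel-context fragment; the writer side is the generic seqlock idiom, not code from this patch, and new_bits is a placeholder:

	#include <linux/seqlock.h>

	static u64 snapshot_avail_bits(struct btrfs_fs_info *fs_info)
	{
		unsigned seq;
		u64 all_avail;

		/* reader: retry the snapshot if a writer got in between */
		do {
			seq = read_seqbegin(&fs_info->profiles_lock);
			all_avail = fs_info->avail_data_alloc_bits |
				    fs_info->avail_system_alloc_bits |
				    fs_info->avail_metadata_alloc_bits;
		} while (read_seqretry(&fs_info->profiles_lock, seq));

		return all_avail;
	}

	/* writer (generic idiom, sketched): bump the sequence around updates */
	static void add_avail_bits(struct btrfs_fs_info *fs_info, u64 new_bits)
	{
		write_seqlock(&fs_info->profiles_lock);
		fs_info->avail_data_alloc_bits |= new_bits;
		write_sequnlock(&fs_info->profiles_lock);
	}
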
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto out;
 	}
 
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid5\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		printk(KERN_ERR "btrfs: unable to go below three "
+		       "devices on raid6\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (strcmp(device_path, "missing") == 0) {
 		struct list_head *devices;
 		struct btrfs_device *tmp;
@@ -1431,7 +1511,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		}
 	} else {
 		ret = btrfs_get_bdev_and_sb(device_path,
-					    FMODE_READ | FMODE_EXCL,
+					    FMODE_WRITE | FMODE_EXCL,
 					    root->fs_info->bdev_holder, 0,
 					    &bdev, &bh);
 		if (ret)
@@ -1556,7 +1636,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	ret = 0;
 
 	/* Notify udev that device has changed */
-	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+	if (bdev)
+		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
 
 error_brelse:
 	brelse(bh);
@@ -2298,7 +2379,11 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_std_error(root->fs_info, ret);
+		return ret;
+	}
 
 	lock_chunks(root);
 
@@ -2614,7 +2699,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
 	chunk_used = btrfs_block_group_used(&cache->item);
 
-	user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+	if (bargs->usage == 0)
+		user_thresh = 1;
+	else if (bargs->usage > 100)
+		user_thresh = cache->key.offset;
+	else
+		user_thresh = div_factor_fine(cache->key.offset,
+					      bargs->usage);
+
 	if (chunk_used < user_thresh)
 		ret = 0;
 
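
The clamp makes the filter's edge cases explicit: usage=0 yields a one-byte threshold, so only completely empty chunks match, and usage>100 matches every chunk. A standalone sketch of the resulting threshold, with div_factor_fine() approximated as num * factor / 100:

	#include <stdio.h>
	#include <stdint.h>

	static uint64_t user_thresh(uint64_t chunk_size, uint32_t usage)
	{
		if (usage == 0)
			return 1;		/* only empty chunks match */
		if (usage > 100)
			return chunk_size;	/* every chunk matches */
		return chunk_size * usage / 100;
	}

	int main(void)
	{
		uint64_t gib = 1ULL << 30;	/* a hypothetical 1GiB chunk */

		printf("usage=0   -> %llu\n", (unsigned long long)user_thresh(gib, 0));
		printf("usage=50  -> %llu\n", (unsigned long long)user_thresh(gib, 50));
		printf("usage=200 -> %llu\n", (unsigned long long)user_thresh(gib, 200));
		return 0;
	}
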
@@ -2656,11 +2748,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 		return 0;
 
 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
-		factor = 2;
-	else
-		factor = 1;
-	factor = num_stripes / factor;
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
 
 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
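
The rewritten factor generalizes "how many of the chunk's stripes hold data": mirrored profiles halve num_stripes, RAID5 gives one stripe to parity, RAID6 two. A standalone sketch with a worked example (the helper and its values are illustrative):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	static uint64_t data_stripes(const char *profile, uint64_t num_stripes)
	{
		if (!strcmp(profile, "dup") || !strcmp(profile, "raid1") ||
		    !strcmp(profile, "raid10"))
			return num_stripes / 2;		/* two copies */
		if (!strcmp(profile, "raid5"))
			return num_stripes - 1;		/* one parity stripe */
		if (!strcmp(profile, "raid6"))
			return num_stripes - 2;		/* P and Q */
		return num_stripes;			/* raid0 / single */
	}

	int main(void)
	{
		/* a 3GiB raid5 chunk over 4 devices puts 1GiB on each device */
		uint64_t chunk_len = 3ULL << 30;

		printf("per-device length: %llu MiB\n", (unsigned long long)
		       ((chunk_len / data_stripes("raid5", 4)) >> 20));
		return 0;
	}
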
@@ -2958,7 +3054,10 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 
 	unset_balance_control(fs_info);
 	ret = del_balance_item(fs_info->tree_root);
-	BUG_ON(ret);
+	if (ret)
+		btrfs_std_error(fs_info, ret);
+
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 
 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -2975,6 +3074,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	unsigned seq;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3017,7 +3117,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
 	else
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-			    BTRFS_BLOCK_GROUP_RAID10);
+			    BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID5 |
+			    BTRFS_BLOCK_GROUP_RAID6);
 
 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3057,23 +3159,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
-	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_system_alloc_bits & allowed) &&
-	     !(bctl->sys.target & allowed)) ||
-	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_metadata_alloc_bits & allowed) &&
-	     !(bctl->meta.target & allowed))) {
-		if (bctl->flags & BTRFS_BALANCE_FORCE) {
-			printk(KERN_INFO "btrfs: force reducing metadata "
-			       "integrity\n");
-		} else {
-			printk(KERN_ERR "btrfs: balance will reduce metadata "
-			       "integrity, use force if you want this\n");
-			ret = -EINVAL;
-			goto out;
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
 		}
-	}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		int num_tolerated_disk_barrier_failures;
@@ -3117,6 +3225,11 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	mutex_lock(&fs_info->balance_mutex);
 	atomic_dec(&fs_info->balance_running);
 
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
 	if (bargs) {
 		memset(bargs, 0, sizeof(*bargs));
 		update_ioctl_balance_args(fs_info, 0, bargs);
@@ -3127,19 +3240,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		__cancel_balance(fs_info);
 	}
 
-	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
-		fs_info->num_tolerated_disk_barrier_failures =
-			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
-	}
-
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
 out:
 	if (bctl->flags & BTRFS_BALANCE_RESUME)
 		__cancel_balance(fs_info);
-	else
+	else {
 		kfree(bctl);
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+	}
 	return ret;
 }
 
@@ -3156,7 +3266,6 @@ static int balance_kthread(void *data)
 		ret = btrfs_balance(fs_info->balance_ctl, NULL);
 	}
 
-	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 	mutex_unlock(&fs_info->balance_mutex);
 	mutex_unlock(&fs_info->volume_mutex);
 
@@ -3179,7 +3288,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
 		return 0;
 	}
 
-	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
 	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
 	if (IS_ERR(tsk))
 		return PTR_ERR(tsk);
@@ -3233,6 +3341,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
 	btrfs_balance_sys(leaf, item, &disk_bargs);
 	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
 
+	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
 	mutex_lock(&fs_info->volume_mutex);
 	mutex_lock(&fs_info->balance_mutex);
 
@@ -3492,13 +3602,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 }
 
 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
-	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
-	{ 1, 2, 1, 1, 1, 2 /* dup */ },
-	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
-	{ 1, 1, 0, 1, 1, 1 /* single */ },
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
 };
 
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	u64 features;
+
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	features = btrfs_super_incompat_flags(info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+		return;
+
+	features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+	btrfs_set_super_incompat_flags(info->super_copy, features);
+	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root,
 			       struct map_lookup **map_ret,
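
Switching the table to designated initializers keys each row by its BTRFS_RAID_* index and makes the six fields self-documenting. A standalone sketch of how such a table gets consumed; the struct is paraphrased and the enough_devs() helper is hypothetical, loosely modeled on the allocator's rounding of ndevs by devs_increment:

	#include <stdio.h>

	struct raid_attr {
		int devs_min;
		int devs_increment;
		int ncopies;
	};

	static int enough_devs(const struct raid_attr *a, int ndevs)
	{
		/* round down to the profile's increment, then check the floor */
		ndevs -= ndevs % a->devs_increment;
		return ndevs >= a->devs_min;
	}

	int main(void)
	{
		struct raid_attr raid6 = { .devs_min = 3, .devs_increment = 1,
					   .ncopies = 3 };
		struct raid_attr raid10 = { .devs_min = 4, .devs_increment = 2,
					    .ncopies = 2 };

		printf("raid6 on 2 devs:  %s\n", enough_devs(&raid6, 2) ? "ok" : "no");
		printf("raid6 on 3 devs:  %s\n", enough_devs(&raid6, 3) ? "ok" : "no");
		printf("raid10 on 5 devs: %s\n", enough_devs(&raid10, 5) ? "ok" : "no");
		return 0;
	}
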
@@ -3514,6 +3697,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -3525,6 +3710,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -3619,12 +3805,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
 			continue;
 
+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
 		devices_info[ndevs].dev_offset = dev_offset;
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
 		++ndevs;
-		WARN_ON(ndevs > fs_devices->rw_devices);
 	}
 
 	/*
@@ -3650,16 +3840,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * ndevs > max_chunk_size * ncopies) {
-		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, ndevs);
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+		stripe_size = max_chunk_size;
+		do_div(stripe_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
 	do_div(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
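
A worked example of the new sizing math: the chunk's logical size is stripe_size * data_stripes, so the per-device slice is derived by dividing max_chunk_size by the data stripe count and rounding up to a 16MiB boundary with the mask. Standalone sketch with assumed numbers:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* assumed geometry: raid6 over 6 devices, 10GiB max chunk size */
		uint64_t max_chunk_size = 10ULL << 30;
		uint64_t num_stripes = 6;
		uint64_t data_stripes = num_stripes - 2;	/* P and Q */
		uint64_t mask = (1ULL << 24) - 1;		/* 16MiB - 1 */
		uint64_t stripe_size;

		/* same math as the patch, with plain division standing in
		 * for do_div() */
		stripe_size = max_chunk_size / data_stripes;
		stripe_size = (stripe_size + mask) & ~mask;

		printf("per-device stripe_size: %llu MiB\n",
		       (unsigned long long)(stripe_size >> 20));
		return 0;
	}
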
@@ -3677,14 +3899,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
 	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
+	num_bytes = stripe_size * data_stripes;
 
 	*stripe_size_out = stripe_size;
 	*num_bytes_out = num_bytes;
@@ -3706,15 +3928,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	write_unlock(&em_tree->lock);
-	free_extent_map(em);
-	if (ret)
-		goto error;
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, type,
-				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				     start, num_bytes);
-	if (ret)
+	if (ret) {
+		free_extent_map(em);
 		goto error;
+	}
 
 	for (i = 0; i < map->num_stripes; ++i) {
 		struct btrfs_device *device;
@@ -3727,15 +3944,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 				info->chunk_root->root_key.objectid,
 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				start, dev_offset, stripe_size);
-		if (ret) {
-			btrfs_abort_transaction(trans, extent_root, ret);
-			goto error;
-		}
+		if (ret)
+			goto error_dev_extent;
 	}
 
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, num_bytes);
+	if (ret) {
+		i = map->num_stripes - 1;
+		goto error_dev_extent;
+	}
+
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
 	kfree(devices_info);
 	return 0;
 
+error_dev_extent:
+	for (; i >= 0; i--) {
+		struct btrfs_device *device;
+		int err;
+
+		device = map->stripes[i].dev;
+		err = btrfs_free_dev_extent(trans, device, start);
+		if (err) {
+			btrfs_abort_transaction(trans, extent_root, err);
+			break;
+		}
+	}
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
@@ -3875,10 +4121,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
 				  &stripe_size, chunk_offset, alloc_profile);
 	if (ret)
@@ -3886,10 +4129,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	sys_chunk_offset = chunk_offset + chunk_size;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
@@ -4002,6 +4242,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -4014,6 +4258,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
@@ -4051,10 +4341,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }
 
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
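
sort_parity_stripes() works because RAID5_P_STRIPE and RAID6_Q_STRIPE compare greater than any real logical address, so the bubble sort pushes them to the tail (the patch swaps bbio->stripes alongside raid_map; this demo sorts only the map). The sentinel values below are assumptions following the huge-u64 convention, not the kernel macros:

	#include <stdio.h>
	#include <stdint.h>

	#define P_STRIPE ((uint64_t)-2)
	#define Q_STRIPE ((uint64_t)-1)

	int main(void)
	{
		uint64_t raid_map[4] = { Q_STRIPE, 4096, P_STRIPE, 0 };
		int i, again = 1;

		/* same bubble sort as the patch: data first, P then Q last */
		while (again) {
			again = 0;
			for (i = 0; i < 3; i++) {
				if (raid_map[i] > raid_map[i + 1]) {
					uint64_t tmp = raid_map[i];

					raid_map[i] = raid_map[i + 1];
					raid_map[i + 1] = tmp;
					again = 1;
				}
			}
		}
		for (i = 0; i < 4; i++)
			printf("slot %d: %llu\n", i,
			       (unsigned long long)raid_map[i]);
		return 0;
	}
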
@@ -4066,6 +4385,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4077,6 +4398,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4093,29 +4415,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
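
raid56_full_stripe_start is the I/O offset rounded down to a multiple of stripe_len * nr_data_stripes, i.e. the start of the full stripe containing the request; a RAID[56] write may then extend to the end of that full stripe but never across it. A standalone sketch with assumed geometry:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* assumed: raid5 with 3 data stripes of 64KiB each */
		uint64_t stripe_len = 64 * 1024;
		uint64_t nr_data = 3;
		uint64_t full_stripe_len = stripe_len * nr_data;	/* 192KiB */
		uint64_t offset = 500 * 1024;	/* I/O offset within the chunk */
		uint64_t start;

		/* same rounding as the patch (do_div() in the kernel) */
		start = offset / full_stripe_len;
		start *= full_stripe_len;

		printf("offset %llu KiB -> full stripe start %llu KiB\n",
		       (unsigned long long)(offset >> 10),
		       (unsigned long long)(start >> 10));
		return 0;
	}
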
@@ -4148,7 +4504,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;
 
 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-				logical, &tmp_length, &tmp_bbio, 0);
+				logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4209,11 +4565,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4264,6 +4620,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					      dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+						mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
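
The raid_map fill above does two jobs at once: it records the logical address of every data stripe and rotates the parity slot by the stripe-set number, which is what distributes P (and Q) across the devices. A standalone sketch that prints the resulting layout for a hypothetical 4-device RAID5:

	#include <stdio.h>
	#include <stdint.h>

	#define P ((uint64_t)-2)	/* assumed parity sentinel */

	int main(void)
	{
		/* assumed geometry: 3 data stripes + P, 64KiB stripes */
		uint64_t num_stripes = 4, nr_data = 3, stripe_len = 64 * 1024;
		uint64_t stripe_nr, i, rot;

		for (stripe_nr = 0; stripe_nr < 4; stripe_nr++) {
			uint64_t raid_map[4];

			/* same fill as the patch: data addresses rotated by
			 * the stripe-set number, parity in the next slot */
			rot = stripe_nr % num_stripes;
			for (i = 0; i < nr_data; i++)
				raid_map[(i + rot) % num_stripes] =
					(stripe_nr * nr_data + i) * stripe_len;
			raid_map[(i + rot) % num_stripes] = P;

			printf("stripe set %llu:", (unsigned long long)stripe_nr);
			for (i = 0; i < num_stripes; i++)
				printf(" %s", raid_map[i] == P ? "P" : "D");
			printf("\n");
		}
		return 0;
	}
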
@@ -4372,8 +4787,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}
 
@@ -4474,6 +4892,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4486,7 +4908,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4500,6 +4922,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
@@ -4510,10 +4933,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4533,8 +4963,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4548,7 +4981,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
@@ -4622,7 +5055,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio->bi_bdev = (struct block_device *)
 					(unsigned long)bbio->mirror_num;
 		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the multi-bio
+		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
@@ -4656,13 +5089,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
 				 struct btrfs_device *device,
 				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4760,7 +5198,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4819,6 +5257,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4827,12 +5266,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;
 
-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;
 
 	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
 	if (map_length < length) {
 		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -4841,11 +5298,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}
 
-	bbio->orig_bio = first_bio;
-	bbio->private = first_bio->bi_private;
-	bbio->end_io = first_bio->bi_end_io;
-	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {