Diffstat (limited to 'fs/btrfs/volumes.c')

 fs/btrfs/volumes.c | 636 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 533 insertions(+), 103 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cbb7f4b1672..35bb2d4ed29f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
 #include <linux/capability.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
 #include "extent_map.h"
@@ -32,6 +34,7 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "async-thread.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
@@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 		new_device->writeable = 0;
 		new_device->in_fs_metadata = 0;
 		new_device->can_discard = 0;
+		spin_lock_init(&new_device->io_lock);
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
 		call_rcu(&device->rcu, free_device);
@@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
792 | return ret; | 796 | return ret; |
793 | } | 797 | } |
794 | 798 | ||
799 | /* | ||
800 | * Look for a btrfs signature on a device. This may be called out of the mount path | ||
801 | * and we are not allowed to call set_blocksize during the scan. The superblock | ||
802 | * is read via pagecache | ||
803 | */ | ||
795 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | 804 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, |
796 | struct btrfs_fs_devices **fs_devices_ret) | 805 | struct btrfs_fs_devices **fs_devices_ret) |
797 | { | 806 | { |
798 | struct btrfs_super_block *disk_super; | 807 | struct btrfs_super_block *disk_super; |
799 | struct block_device *bdev; | 808 | struct block_device *bdev; |
800 | struct buffer_head *bh; | 809 | struct page *page; |
801 | int ret; | 810 | void *p; |
811 | int ret = -EINVAL; | ||
802 | u64 devid; | 812 | u64 devid; |
803 | u64 transid; | 813 | u64 transid; |
804 | u64 total_devices; | 814 | u64 total_devices; |
815 | u64 bytenr; | ||
816 | pgoff_t index; | ||
805 | 817 | ||
818 | /* | ||
819 | * we would like to check all the supers, but that would make | ||
820 | * a btrfs mount succeed after a mkfs from a different FS. | ||
821 | * So, we need to add a special mount option to scan for | ||
822 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead | ||
823 | */ | ||
824 | bytenr = btrfs_sb_offset(0); | ||
806 | flags |= FMODE_EXCL; | 825 | flags |= FMODE_EXCL; |
807 | mutex_lock(&uuid_mutex); | 826 | mutex_lock(&uuid_mutex); |
808 | ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); | 827 | |
809 | if (ret) | 828 | bdev = blkdev_get_by_path(path, flags, holder); |
829 | |||
830 | if (IS_ERR(bdev)) { | ||
831 | ret = PTR_ERR(bdev); | ||
810 | goto error; | 832 | goto error; |
811 | disk_super = (struct btrfs_super_block *)bh->b_data; | 833 | } |
834 | |||
835 | /* make sure our super fits in the device */ | ||
836 | if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) | ||
837 | goto error_bdev_put; | ||
838 | |||
839 | /* make sure our super fits in the page */ | ||
840 | if (sizeof(*disk_super) > PAGE_CACHE_SIZE) | ||
841 | goto error_bdev_put; | ||
842 | |||
843 | /* make sure our super doesn't straddle pages on disk */ | ||
844 | index = bytenr >> PAGE_CACHE_SHIFT; | ||
845 | if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) | ||
846 | goto error_bdev_put; | ||
847 | |||
848 | /* pull in the page with our super */ | ||
849 | page = read_cache_page_gfp(bdev->bd_inode->i_mapping, | ||
850 | index, GFP_NOFS); | ||
851 | |||
852 | if (IS_ERR_OR_NULL(page)) | ||
853 | goto error_bdev_put; | ||
854 | |||
855 | p = kmap(page); | ||
856 | |||
857 | /* align our pointer to the offset of the super block */ | ||
858 | disk_super = p + (bytenr & ~PAGE_CACHE_MASK); | ||
859 | |||
860 | if (btrfs_super_bytenr(disk_super) != bytenr || | ||
861 | disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) | ||
862 | goto error_unmap; | ||
863 | |||
812 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 864 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
813 | transid = btrfs_super_generation(disk_super); | 865 | transid = btrfs_super_generation(disk_super); |
814 | total_devices = btrfs_super_num_devices(disk_super); | 866 | total_devices = btrfs_super_num_devices(disk_super); |
867 | |||
815 | if (disk_super->label[0]) { | 868 | if (disk_super->label[0]) { |
816 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) | 869 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) |
817 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; | 870 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
@@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
819 | } else { | 872 | } else { |
820 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); | 873 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
821 | } | 874 | } |
875 | |||
822 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 876 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
823 | (unsigned long long)devid, (unsigned long long)transid, path); | 877 | (unsigned long long)devid, (unsigned long long)transid, path); |
878 | |||
824 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 879 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
825 | if (!ret && fs_devices_ret) | 880 | if (!ret && fs_devices_ret) |
826 | (*fs_devices_ret)->total_devices = total_devices; | 881 | (*fs_devices_ret)->total_devices = total_devices; |
827 | brelse(bh); | 882 | |
883 | error_unmap: | ||
884 | kunmap(page); | ||
885 | page_cache_release(page); | ||
886 | |||
887 | error_bdev_put: | ||
828 | blkdev_put(bdev, flags); | 888 | blkdev_put(bdev, flags); |
829 | error: | 889 | error: |
830 | mutex_unlock(&uuid_mutex); | 890 | mutex_unlock(&uuid_mutex); |
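
The three new checks above reduce to page-index arithmetic: the superblock is
only usable if it begins and ends inside the same page-cache page. A minimal
standalone sketch of the straddle check, assuming a 4KiB page size in place of
the kernel's PAGE_CACHE_SIZE/PAGE_CACHE_SHIFT macros:

	#include <stdint.h>
	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumed 4KiB pages */

	/* Mirrors the straddle check in btrfs_scan_one_device(): a
	 * structure of 'len' bytes at 'bytenr' must not cross a page. */
	static int fits_in_one_page(uint64_t bytenr, uint64_t len)
	{
		uint64_t index = bytenr >> PAGE_SHIFT;

		return ((bytenr + len - 1) >> PAGE_SHIFT) == index;
	}

	int main(void)
	{
		/* super 0 sits at 64KiB, which is page aligned */
		printf("%d\n", fits_in_one_page(65536, 4096));     /* 1 */
		printf("%d\n", fits_in_one_page(65536 + 1, 4096)); /* 0 */
		return 0;
	}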
@@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
 
-	all_avail = root->fs_info->avail_data_alloc_bits |
-		root->fs_info->avail_system_alloc_bits |
-		root->fs_info->avail_metadata_alloc_bits;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
 
 	num_devices = root->fs_info->fs_devices->num_devices;
 	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
@@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto out;
 	}
 
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		printk(KERN_ERR "btrfs: unable to go below two "
+		       "devices on raid5\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		printk(KERN_ERR "btrfs: unable to go below three "
+		       "devices on raid6\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (strcmp(device_path, "missing") == 0) {
 		struct list_head *devices;
 		struct btrfs_device *tmp;
@@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
 	chunk_used = btrfs_block_group_used(&cache->item);
 
 	if (bargs->usage == 0)
-		user_thresh = 0;
+		user_thresh = 1;
 	else if (bargs->usage > 100)
 		user_thresh = cache->key.offset;
 	else
@@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
 		return 0;
 
 	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
-	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
-		factor = 2;
-	else
-		factor = 1;
-	factor = num_stripes / factor;
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
 
 	for (i = 0; i < num_stripes; i++) {
 		stripe = btrfs_stripe_nr(chunk, i);
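
The rewritten filter computes 'factor' as the number of stripes that hold
distinct data, so dividing it out of the stripe count yields the chunk's
per-device footprint. A hedged standalone illustration of the same table
(profile names are strings here purely for the example):

	#include <stdio.h>
	#include <string.h>

	/* Data-stripe count per profile, as in chunk_drange_filter() */
	static int data_stripe_factor(int num_stripes, const char *profile)
	{
		if (!strcmp(profile, "dup") || !strcmp(profile, "raid1") ||
		    !strcmp(profile, "raid10"))
			return num_stripes / 2;	/* two copies of everything */
		if (!strcmp(profile, "raid5"))
			return num_stripes - 1;	/* one parity stripe */
		if (!strcmp(profile, "raid6"))
			return num_stripes - 2;	/* P and Q stripes */
		return num_stripes;		/* raid0/single */
	}

	int main(void)
	{
		/* a six-stripe raid6 chunk carries four stripes of data */
		printf("%d\n", data_stripe_factor(6, "raid6"));	/* 4 */
		return 0;
	}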
@@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	int mixed = 0;
 	int ret;
 	u64 num_devices;
+	unsigned seq;
 
 	if (btrfs_fs_closing(fs_info) ||
 	    atomic_read(&fs_info->balance_pause_req) ||
@@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
 	else
 		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
-				BTRFS_BLOCK_GROUP_RAID10);
+				BTRFS_BLOCK_GROUP_RAID10 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6);
 
 	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
 	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
-			BTRFS_BLOCK_GROUP_RAID10;
-	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_system_alloc_bits & allowed) &&
-	     !(bctl->sys.target & allowed)) ||
-	    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	     (fs_info->avail_metadata_alloc_bits & allowed) &&
-	     !(bctl->meta.target & allowed))) {
-		if (bctl->flags & BTRFS_BALANCE_FORCE) {
-			printk(KERN_INFO "btrfs: force reducing metadata "
-			       "integrity\n");
-		} else {
-			printk(KERN_ERR "btrfs: balance will reduce metadata "
-			       "integrity, use force if you want this\n");
-			ret = -EINVAL;
-			goto out;
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
 		}
-	}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		int num_tolerated_disk_barrier_failures;
@@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	mutex_lock(&fs_info->balance_mutex);
 	atomic_dec(&fs_info->balance_running);
 
-	if (bargs) {
-		memset(bargs, 0, sizeof(*bargs));
-		update_ioctl_balance_args(fs_info, 0, bargs);
-	}
-
-	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
-	    balance_need_close(fs_info)) {
-		__cancel_balance(fs_info);
-	}
-
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		fs_info->num_tolerated_disk_barrier_failures =
 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
 	}
 
+	if (bargs) {
+		memset(bargs, 0, sizeof(*bargs));
+		update_ioctl_balance_args(fs_info, 0, bargs);
+	}
+
 	wake_up(&fs_info->balance_wait_q);
 
 	return ret;
@@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 }
 
 struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
-	{ 2, 1, 0, 4, 2, 2 /* raid10 */ },
-	{ 1, 1, 2, 2, 2, 2 /* raid1 */ },
-	{ 1, 2, 1, 1, 1, 2 /* dup */ },
-	{ 1, 1, 0, 2, 1, 1 /* raid0 */ },
-	{ 1, 1, 1, 1, 1, 1 /* single */ },
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
 };
 
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	u64 features;
+
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	features = btrfs_super_incompat_flags(info->super_copy);
+	if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
+		return;
+
+	features |= BTRFS_FEATURE_INCOMPAT_RAID56;
+	btrfs_set_super_incompat_flags(info->super_copy, features);
+	printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *extent_root,
 			       struct map_lookup **map_ret,
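
The new designated-initializer table makes the allocator's inputs explicit:
devs_min gates whether a profile is usable at all, devs_increment rounds the
device count (raid1/raid10 want pairs), and ncopies is how many stripes each
logical byte occupies. A sketch of how these fields combine, under the
assumption that the rounding below matches the allocator's
"ndevs -= ndevs % devs_increment" step:

	#include <stdio.h>

	struct raid_attr {	/* mirrors btrfs_raid_attr above */
		int sub_stripes;
		int dev_stripes;
		int devs_max;	/* 0 == as many as possible */
		int devs_min;
		int devs_increment;
		int ncopies;
	};

	/* Devices a new chunk of this profile would span, or 0 if the
	 * profile cannot be used with 'avail' writable devices. */
	static int usable_devs(const struct raid_attr *a, int avail)
	{
		if (avail < a->devs_min)
			return 0;
		if (a->devs_max && avail > a->devs_max)
			avail = a->devs_max;
		return avail - avail % a->devs_increment;
	}

	int main(void)
	{
		struct raid_attr raid10 = { 2, 1, 0, 4, 2, 2 };

		printf("%d\n", usable_devs(&raid10, 5)); /* 4: wants pairs */
		return 0;
	}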
@@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
@@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
 			continue;
 
+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
 		devices_info[ndevs].dev_offset = dev_offset;
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
 		++ndevs;
-		WARN_ON(ndevs > fs_devices->rw_devices);
 	}
 
 	/*
@@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * ndevs > max_chunk_size * ncopies) {
-		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, ndevs);
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+		stripe_size = max_chunk_size;
+		do_div(stripe_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
 	do_div(stripe_size, dev_stripes);
 
 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
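
Sizing the chunk by data_stripes instead of the old num_stripes / ncopies
arithmetic is what keeps parity profiles honest: parity stripes consume disk
but no logical address space. A worked example, assuming six devices and a
hypothetical 10GiB max_chunk_size:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t max_chunk_size = 10ULL << 30;	/* assumed cap */
		int num_stripes = 6;
		int data_stripes = num_stripes - 2;	/* raid6: minus P+Q */
		uint64_t mask = (1ULL << 24) - 1;
		uint64_t stripe_size = max_chunk_size / data_stripes;

		/* bump to a 16MB boundary, as in the hunk above */
		stripe_size = (stripe_size + mask) & ~mask;

		printf("per-device stripe: %llu MiB\n",
		       (unsigned long long)(stripe_size >> 20));	/* 2560 */
		printf("logical chunk:     %llu MiB\n",
		       (unsigned long long)
		       ((stripe_size * data_stripes) >> 20));	/* 10240 */
		return 0;
	}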
@@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
 	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
+	num_bytes = stripe_size * data_stripes;
 
 	*stripe_size_out = stripe_size;
 	*num_bytes_out = num_bytes;
@@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em);
 	write_unlock(&em_tree->lock);
-	free_extent_map(em);
-	if (ret)
-		goto error;
-
-	ret = btrfs_make_block_group(trans, extent_root, 0, type,
-				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				     start, num_bytes);
-	if (ret)
+	if (ret) {
+		free_extent_map(em);
 		goto error;
+	}
 
 	for (i = 0; i < map->num_stripes; ++i) {
 		struct btrfs_device *device;
@@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 				     info->chunk_root->root_key.objectid,
 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				     start, dev_offset, stripe_size);
-		if (ret) {
-			btrfs_abort_transaction(trans, extent_root, ret);
-			goto error;
-		}
+		if (ret)
+			goto error_dev_extent;
+	}
+
+	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+				     start, num_bytes);
+	if (ret) {
+		i = map->num_stripes - 1;
+		goto error_dev_extent;
 	}
 
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
+
 	kfree(devices_info);
 	return 0;
 
+error_dev_extent:
+	for (; i >= 0; i--) {
+		struct btrfs_device *device;
+		int err;
+
+		device = map->stripes[i].dev;
+		err = btrfs_free_dev_extent(trans, device, start);
+		if (err) {
+			btrfs_abort_transaction(trans, extent_root, err);
+			break;
+		}
+	}
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
@@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 	if (ret)
 		return ret;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			fs_info->avail_metadata_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
 				  &stripe_size, chunk_offset, alloc_profile);
 	if (ret)
@@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
 
 	sys_chunk_offset = chunk_offset + chunk_size;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			fs_info->avail_system_alloc_bits;
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
 				  &sys_chunk_size, &sys_stripe_size,
 				  sys_chunk_offset, alloc_profile);
@@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
@@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
+	return ret;
+}
+
 static int find_live_mirror(struct btrfs_fs_info *fs_info,
 			    struct map_lookup *map, int first, int num,
 			    int optimal, int dev_replace_is_ongoing)
@@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
 	return optimal;
 }
 
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
+
 static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
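
sort_parity_stripes() works because the P/Q placeholders written into raid_map
compare greater than any real logical address, so one bubble sort floats them
to the end while keeping the data stripes in logical order. A standalone
sketch (the RAID5_P_STRIPE/RAID6_Q_STRIPE sentinels are assumed here to be the
top u64 values, as the comparison requires):

	#include <stdint.h>
	#include <stdio.h>

	#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinels */
	#define RAID6_Q_STRIPE ((uint64_t)-1)

	int main(void)
	{
		/* one rotated raid6 stripe-set: parity first, data last */
		uint64_t raid_map[4] = { RAID5_P_STRIPE, RAID6_Q_STRIPE,
					 1048576, 1114112 };
		int i, again = 1;

		/* same loop shape as sort_parity_stripes(), minus the
		 * paired bbio->stripes swap */
		while (again) {
			again = 0;
			for (i = 0; i < 4 - 1; i++) {
				if (raid_map[i] > raid_map[i + 1]) {
					uint64_t l = raid_map[i];

					raid_map[i] = raid_map[i + 1];
					raid_map[i + 1] = l;
					again = 1;
				}
			}
		}

		for (i = 0; i < 4; i++)	/* data, data, P, Q */
			printf("%llu\n", (unsigned long long)raid_map[i]);
		return 0;
	}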
@@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
 	int ret = 0;
@@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	int num_alloc_stripes;
 	int patch_the_first_stripe_for_dev_replace = 0;
 	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
+	if (mirror_num > map->num_stripes)
+		mirror_num = 0;
+
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
@@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		u64 physical_of_found = 0;
 
 		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
-					logical, &tmp_length, &tmp_bbio, 0);
+					logical, &tmp_length, &tmp_bbio, 0, NULL);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
 			goto out;
@@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
@@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 					    dev_replace_is_ongoing);
 			mirror_num = stripe_index - old_stripe_index + 1;
 		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+					       mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
+		}
 	} else {
 		/*
 		 * after this do_div call, stripe_nr is the number of stripes
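
The rotation math above is what spreads parity across devices: stripe_nr
selects the full stripe-set, rot shifts which slot gets the first data
stripe, and P (plus Q on raid6) land in the slots after the data. A worked
example for a hypothetical three-device raid5 map with a 64KiB stripe_len:

	#include <stdint.h>
	#include <stdio.h>

	#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel */

	int main(void)
	{
		int num_stripes = 3, nr_data = 2;	/* 3-dev raid5 */
		uint64_t stripe_len = 65536, em_start = 0;
		uint64_t stripe_nr = 5;		/* sixth full stripe-set */
		uint64_t raid_map[3], tmp = stripe_nr * nr_data;
		int i, rot = stripe_nr % num_stripes;	/* do_div() here */

		/* fill in the logical address of each data stripe */
		for (i = 0; i < nr_data; i++)
			raid_map[(i + rot) % num_stripes] =
				em_start + (tmp + i) * stripe_len;
		raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;

		for (i = 0; i < num_stripes; i++)
			printf("slot %d: %llu\n", i,
			       (unsigned long long)raid_map[i]);
		/* rot == 2: parity lands in slot 1 for this stripe-set */
		return 0;
	}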
@@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
 			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
 		}
 	}
 
@@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
 		bbio->mirror_num = map->num_stripes + 1;
 	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
+	}
 out:
 	if (dev_replace_is_ongoing)
 		btrfs_dev_replace_unlock(dev_replace);
@@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
 	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
-				 mirror_num);
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
@@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */
@@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
@@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
 			bio->bi_bdev = (struct block_device *)
 					(unsigned long)bbio->mirror_num;
 			/* only send an error to the higher layers if it is
-			 * beyond the tolerance of the multi-bio
+			 * beyond the tolerance of the btrfs bio
 			 */
 			if (atomic_read(&bbio->error) > bbio->max_errors) {
 				err = -EIO;
@@ -4668,13 +5079,18 @@ struct async_sched {
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline void schedule_bio(struct btrfs_root *root,
-				  struct btrfs_device *device,
-				  int rw, struct bio *bio)
+noinline void btrfs_schedule_bio(struct btrfs_root *root,
+				 struct btrfs_device *device,
+				 int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
@@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
 #endif
 	bio->bi_bdev = dev->bdev;
 	if (async)
-		schedule_bio(root, dev, rw, bio);
+		btrfs_schedule_bio(root, dev, rw, bio);
 	else
 		btrfsic_submit_bio(rw, bio);
 }
@@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
@@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	length = bio->bi_size;
 	map_length = length;
 
-	ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	if (ret)
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+				mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
 		return ret;
 
 	total_devs = bbio->num_stripes;
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
+
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
+		} else {
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
+		}
+	}
+
 	if (map_length < length) {
 		printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		BUG();
 	}
 
-	bbio->orig_bio = first_bio;
-	bbio->private = first_bio->bi_private;
-	bbio->end_io = first_bio->bi_end_io;
-	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
-
 	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
 		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {