Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    2
-rw-r--r--  fs/btrfs/Makefile           |    2
-rw-r--r--  fs/btrfs/ctree.h            |   35
-rw-r--r--  fs/btrfs/disk-io.c          |   62
-rw-r--r--  fs/btrfs/disk-io.h          |    7
-rw-r--r--  fs/btrfs/extent-tree.c      |   88
-rw-r--r--  fs/btrfs/extent_io.c        |   18
-rw-r--r--  fs/btrfs/free-space-cache.c |   50
-rw-r--r--  fs/btrfs/inode.c            |   18
-rw-r--r--  fs/btrfs/raid56.c           | 1647
-rw-r--r--  fs/btrfs/raid56.h           |   51
-rw-r--r--  fs/btrfs/scrub.c            |    8
-rw-r--r--  fs/btrfs/transaction.c      |    3
-rw-r--r--  fs/btrfs/volumes.c          |  385
-rw-r--r--  fs/btrfs/volumes.h          |    9
15 files changed, 2283 insertions(+), 102 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..4f5dc93fa2f8 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,8 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ab51be6879f..0cce3aafbd62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -502,6 +502,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)

 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)

 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +512,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)

 /*
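
Because RAID56 is added to BTRFS_FEATURE_INCOMPAT_SUPP here, this kernel will mount filesystems carrying the new bit while older kernels refuse them. A minimal sketch of the incompat gate that open_ctree applies against the superblock (variable names and the error label are illustrative, not part of this patch):

	u64 features = btrfs_super_incompat_flags(disk_super) &
		       ~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		/* e.g. a raid5/6 filesystem on a pre-raid56 kernel */
		err = -EINVAL;
		goto fail_alloc;
	}
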
@@ -952,8 +954,10 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+#define BTRFS_NR_RAID_TYPES		7

 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +965,8 @@ struct btrfs_dev_replace_item {

 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1235,20 @@ struct seq_list {
 	u64 seq;
 };

+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct btrfs_stripe_hash *table;
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
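
With BTRFS_STRIPE_HASH_TABLE_BITS == 11 the table always has 1 << 11 = 2048 buckets. btrfs_alloc_stripe_hash_table() in raid56.c below allocates the header and the bucket array in a single kzalloc() and points ->table just past the header; roughly:

	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;

	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
	if (!table)
		return -ENOMEM;
	/* the btrfs_stripe_hash buckets live immediately after the header */
	table->table = (void *)(table + 1);
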
@@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 65f03670a952..e9fa7b4d18e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
+#include "raid56.h"

 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ err:
 		btree_readahead_hook(root, eb, eb->start, ret);
 	}

-	if (ret)
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;

 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata == 1)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
-		else if (end_io_wq->metadata == 2)
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			btrfs_queue_worker(&fs_info->endio_freespace_worker,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
+		else if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
 					   &end_io_wq->work);
 		else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
  * 0 - if data
  * 1 - if normal metadta
  * 2 - if writing to the free space cache area
+ * 3 - raid parity work
  */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);

+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
+
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);

@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
 			   "endio-meta-write", fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_raid56_workers,
+			   "endio-raid56", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->rmw_workers,
+			   "rmw", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
+	fs_info->endio_raid56_workers.idle_thresh = 4;
+	fs_info->rmw_workers.idle_thresh = 2;

 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->fixup_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->rmw_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ fail_bdi:
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 	return err;

@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 			    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
 			      == 0)))
 				num_tolerated_disk_barrier_failures = 0;
-			else if (num_tolerated_disk_barrier_failures > 1
-				 &&
-				 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-					   BTRFS_BLOCK_GROUP_RAID10)))
-				num_tolerated_disk_barrier_failures = 1;
+			else if (num_tolerated_disk_barrier_failures > 1) {
+				if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+				    BTRFS_BLOCK_GROUP_RAID5 |
+				    BTRFS_BLOCK_GROUP_RAID10)) {
+					num_tolerated_disk_barrier_failures = 1;
+				} else if (flags &
+					   BTRFS_BLOCK_GROUP_RAID5) {
+					num_tolerated_disk_barrier_failures = 2;
+				}
+			}
 		}
 	}
 	up_read(&sinfo->groups_sem);
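
The intent of the new branch is that raid5 tolerates losing one device's barriers and raid6 two; note that as written RAID5 is tested in both arms, so the two-failure case is unreachable (presumably the second test was meant to be RAID6). The intended mapping, as an illustrative sketch:

	if (flags & BTRFS_BLOCK_GROUP_RAID6)
		num_tolerated_disk_barrier_failures = 2;
	else if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
			  BTRFS_BLOCK_GROUP_RAID5 |
			  BTRFS_BLOCK_GROUP_RAID10))
		num_tolerated_disk_barrier_failures = 1;
	else
		num_tolerated_disk_barrier_failures = 0;
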
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);

+	btrfs_free_stripe_hash_table(fs_info);
+
 	return 0;
 }

diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12

+enum {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = 16 * 1024;
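
These names replace the bare 0/1/2 magic numbers that callers were already passing in the `metadata` argument of btrfs_bio_wq_end_io(), and add a fourth value for parity work. A raid56 submission routes its completion to the new endio-raid56 pool with something like (a hedged sketch; the error label is illustrative):

	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
	if (ret)
		goto out_err;
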
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d133edfcd449..3345f68fc64b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		*actual_bytes = discarded_bytes;


+	if (ret == -EOPNOTSUPP)
+		ret = 0;
 	return ret;
 }

@@ -3276,6 +3279,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 	u64 target;
+	u64 tmp;

 	/*
 	 * see if restripe for this chunk_type is in progress, if so
@@ -3292,30 +3296,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	}
 	spin_unlock(&root->fs_info->balance_lock);

+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;

-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;

-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;

-	return extended_to_chunk(flags);
+	return extended_to_chunk(flags | tmp);
 }

 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
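
The rewrite replaces the pairwise "drop the weaker profile" checks with a single cascade: every redundant-profile bit is pulled out of flags into tmp, then exactly one survivor is chosen in the fixed preference order raid6 > raid5 > raid10 > raid1 > raid0. For example (illustrative values):

	/* both raid1 and raid6 were somehow set, e.g. mid-balance */
	u64 flags = BTRFS_BLOCK_GROUP_METADATA |
		    BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID6;

	flags = btrfs_reduce_alloc_profile(root, flags);
	/* only the raid6 bit survives alongside the type bit */
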
@@ -3333,6 +3339,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
+	u64 ret;

 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3348,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;

-	return get_alloc_profile(root, flags);
+	ret = get_alloc_profile(root, flags);
+	return ret;
 }

 /*
@@ -3516,8 +3524,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;

-	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-	    type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
 		num_dev = root->fs_info->fs_devices->rw_devices;
 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 		num_dev = 2;
@@ -3667,7 +3677,9 @@ static int can_overcommit(struct btrfs_root *root,

 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
-	 * space is actually useable.
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
 	 */
 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
 		       BTRFS_BLOCK_GROUP_RAID1 |
@@ -5455,10 +5467,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }

-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+			struct btrfs_block_group_cache *cache,
+			u64 val, u64 num_bytes)
 {
-	u64 mask = ((u64)root->stripesize - 1);
-	u64 ret = (val + mask) & ~mask;
+	u64 mask;
+	u64 ret;
+	mask = ((u64)root->stripesize - 1);
+	ret = (val + mask) & ~mask;
 	return ret;
 }

@@ -5519,9 +5535,12 @@ int __get_raid_index(u64 flags)
 		index = 2;
 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
 		index = 3;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		index = 5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		index = 6;
 	else
-		index = 4;
-
+		index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */
 	return index;
 }

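
Together with BTRFS_NR_RAID_TYPES going from 5 to 7 in ctree.h, the full index mapping implied by __get_raid_index() is the following (a sketch; indexes 0 through 2 come from the unshown top of the function, and the kernel itself just uses plain ints):

	enum {
		RAID_IDX_RAID10 = 0,
		RAID_IDX_RAID1  = 1,
		RAID_IDX_DUP    = 2,
		RAID_IDX_RAID0  = 3,
		RAID_IDX_SINGLE = 4,	/* the catch-all else branch */
		RAID_IDX_RAID5  = 5,
		RAID_IDX_RAID6  = 6,
	};
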
@@ -5665,6 +5684,8 @@ search:
 		if (!block_group_bits(block_group, data)) {
 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
 				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
 				BTRFS_BLOCK_GROUP_RAID10;

 			/*
@@ -5835,7 +5856,8 @@ unclustered_alloc:
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, offset);
+		search_start = stripe_align(root, used_block_group,
+					    offset, num_bytes);

 		/* move on to the next group */
 		if (search_start + num_bytes >
@@ -7203,6 +7225,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		root->fs_info->fs_devices->missing_devices;

 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;

 	if (num_devices == 1) {
@@ -7754,7 +7777,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		btrfs_release_path(path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
-
+		cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       found_key.objectid);
 		btrfs_init_free_space_ctl(cache);

 		/*
@@ -7808,6 +7833,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		if (!(get_alloc_profile(root, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
@@ -7883,6 +7910,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
 	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       chunk_offset);

 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 86ecca48c604..3b9fb478b0d1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 	if (ret)
 		err = ret;

-	if (did_repair) {
-		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-					rec->start + rec->len - 1,
-					EXTENT_DAMAGED, GFP_NOFS);
-		if (ret && !err)
-			err = ret;
-	}
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;

 	kfree(rec);
 	return err;
@@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;

 	BUG_ON(!mirror_num);

+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
 	bio = bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
@@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)
 					    failrec->failed_mirror);
 			did_repair = !ret;
 		}
+		ret = 0;
 	}

 out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..62020b7f7036 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1463,10 +1463,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 }

 static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	u64 ctl_off;
+	u64 tmp;
+	u64 align_off;
 	int ret;

 	if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1485,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
 		if (entry->bytes < *bytes)
 			continue;

+		/* make sure the space returned is big enough
+		 * to match our requested alignment
+		 */
+		if (*bytes >= align) {
+			ctl_off = entry->offset - ctl->start;
+			tmp = ctl_off + align - 1;;
+			do_div(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off)
+			continue;
+
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, offset, bytes);
-			if (!ret)
+			ret = search_bitmap(ctl, entry, &tmp, bytes);
+			if (!ret) {
+				*offset = tmp;
 				return entry;
+			}
 			continue;
 		}

-		*offset = entry->offset;
-		*bytes = entry->bytes;
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
 		return entry;
 	}

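
A worked example of the round-up above, assuming the block group starts at ctl->start = 1M, align (the full stripe length) is 128K, and the free extent begins at 1M + 200K:

	ctl_off = (1M + 200K) - 1M;		/* 200K into the block group */
	tmp = 200K + 128K - 1;			/* just under 328K */
	do_div(tmp, align);			/* tmp = 2: round up to whole stripes */
	tmp = 2 * 128K + 1M;			/* 1M + 256K, first aligned offset */
	align_off = tmp - entry->offset;	/* 56K of slack at the front */

The entry is only usable if it still has *bytes of room after skipping that 56K, which is what the second `continue` above checks.
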
@@ -2091,9 +2114,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;

 	spin_lock(&ctl->tree_lock);
-	entry = find_free_space(ctl, &offset, &bytes_search);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len);
 	if (!entry)
 		goto out;

@@ -2103,9 +2129,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
+
 		unlink_free_space(ctl, entry);
-		entry->offset += bytes;
-		entry->bytes -= bytes;
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
 		if (!entry->bytes)
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		else
@@ -2115,6 +2147,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 out:
 	spin_unlock(&ctl->tree_lock);

+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
 	return ret;
 }

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b98c4ce3c6f..6f4e41dca970 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/blkdev.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -6386,19 +6387,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;

 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
 		return -EIO;
 	}
-
 	if (map_length >= orig_bio->bi_size) {
 		bio = orig_bio;
 		goto submit;
 	}

-	async_submit = 1;
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
 	if (!bio)
 		return -ENOMEM;
@@ -6440,7 +6446,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	bio->bi_end_io = btrfs_end_dio_bio;

 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(root->fs_info, READ,
+	ret = btrfs_map_block(root->fs_info, rw,
 			      start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
@@ -6583,15 +6589,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;

 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
 			    offset, nr_segs))
 		return 0;

-	return __blockdev_direct_IO(rw, iocb, inode,
+	ret = __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
 		   btrfs_submit_direct, 0);
+	return ret;
 }

 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..d02510f34936
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,1647 @@
+/*
+ * Copyright (C) 2012 Fusion-io  All rights reserved.
+ * Copyright (C) 2012 Intel Corp. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/random.h>
+#include <linux/iocontext.h>
+#include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/hash.h>
+#include <linux/list_sort.h>
+#include <linux/raid/xor.h>
+#include <asm/div64.h>
+#include "compat.h"
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+#include "raid56.h"
+#include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+
+/* set when additional merges to this rbio are not allowed */
+#define RBIO_RMW_LOCKED_BIT	1
+
+struct btrfs_raid_bio {
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_bio *bbio;
+
+	/*
+	 * logical block numbers for the start of each stripe
+	 * The last one or two are p/q.  These are sorted,
+	 * so raid_map[0] is the start of our full stripe
+	 */
+	u64 *raid_map;
+
+	/* while we're doing rmw on a stripe
+	 * we put it into a hash table so we can
+	 * lock the stripe and merge more rbios
+	 * into it.
+	 */
+	struct list_head hash_list;
+
+	/*
+	 * for scheduling work in the helper threads
+	 */
+	struct btrfs_work work;
+
+	/*
+	 * bio list and bio_list_lock are used
+	 * to add more bios into the stripe
+	 * in hopes of avoiding the full rmw
+	 */
+	struct bio_list bio_list;
+	spinlock_t bio_list_lock;
+
+	/*
+	 * also protected by the bio_list_lock, the
+	 * stripe locking code uses plug_list to hand off
+	 * the stripe lock to the next pending IO
+	 */
+	struct list_head plug_list;
+
+	/*
+	 * flags that tell us if it is safe to
+	 * merge with this bio
+	 */
+	unsigned long flags;
+
+	/* size of each individual stripe on disk */
+	int stripe_len;
+
+	/* number of data stripes (no p/q) */
+	int nr_data;
+
+	/*
+	 * set if we're doing a parity rebuild
+	 * for a read from higher up, which is handled
+	 * differently from a parity rebuild as part of
+	 * rmw
+	 */
+	int read_rebuild;
+
+	/* first bad stripe */
+	int faila;
+
+	/* second bad stripe (for raid6 use) */
+	int failb;
+
+	/*
+	 * number of pages needed to represent the full
+	 * stripe
+	 */
+	int nr_pages;
+
+	/*
+	 * size of all the bios in the bio_list.  This
+	 * helps us decide if the rbio maps to a full
+	 * stripe or not
+	 */
+	int bio_list_bytes;
+
+	atomic_t refs;
+
+	/*
+	 * these are two arrays of pointers.  We allocate the
+	 * rbio big enough to hold them both and setup their
+	 * locations when the rbio is allocated
+	 */
+
+	/* pointers to pages that we allocated for
+	 * reading/writing stripes directly from the disk (including P/Q)
+	 */
+	struct page **stripe_pages;
+
+	/*
+	 * pointers to the pages in the bio_list.  Stored
+	 * here for faster lookup
+	 */
+	struct page **bio_pages;
+};
+
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
+static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
+static void rmw_work(struct btrfs_work *work);
+static void read_rebuild_work(struct btrfs_work *work);
+static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
+static void async_read_rebuild(struct btrfs_raid_bio *rbio);
+static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
+static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
+static void __free_raid_bio(struct btrfs_raid_bio *rbio);
+static void index_rbio_pages(struct btrfs_raid_bio *rbio);
+static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
+
+/*
+ * the stripe hash table is used for locking, and to collect
+ * bios in hopes of making a full stripe
+ */
+int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	struct btrfs_stripe_hash_table *table;
+	struct btrfs_stripe_hash_table *x;
+	struct btrfs_stripe_hash *cur;
+	struct btrfs_stripe_hash *h;
+	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
+	int i;
+
+	if (info->stripe_hash_table)
+		return 0;
+
+	table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
+	if (!table)
+		return -ENOMEM;
+
+	table->table = (void *)(table + 1);
+	h = table->table;
+
+	for (i = 0; i < num_entries; i++) {
+		cur = h + i;
+		INIT_LIST_HEAD(&cur->hash_list);
+		spin_lock_init(&cur->lock);
+		init_waitqueue_head(&cur->wait);
+	}
+
+	x = cmpxchg(&info->stripe_hash_table, NULL, table);
+	if (x)
+		kfree(x);
+	return 0;
+}
+
+/*
+ * we hash on the first logical address of the stripe
+ */
+static int rbio_bucket(struct btrfs_raid_bio *rbio)
+{
+	u64 num = rbio->raid_map[0];
+
+	/*
+	 * we shift down quite a bit.  We're using byte
+	 * addressing, and most of the lower bits are zeros.
+	 * This tends to upset hash_64, and it consistently
+	 * returns just one or two different values.
+	 *
+	 * shifting off the lower bits fixes things.
+	 */
+	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
+}
+
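Full stripes start at logical addresses with many low zero bits (they are multiples of the stripe length), so hashing the raw address would feed hash_64() nearly constant input, as the comment above notes. The shift by 16 discards those bits before bucketing. Usage is simply (as lock_stripe_add() below does):

	struct btrfs_stripe_hash *h;

	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
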
+/*
+ * merging means we take the bio_list from the victim and
+ * splice it into the destination.  The victim should
+ * be discarded afterwards.
+ *
+ * must be called with dest->rbio_list_lock held
+ */
+static void merge_rbio(struct btrfs_raid_bio *dest,
+		       struct btrfs_raid_bio *victim)
+{
+	bio_list_merge(&dest->bio_list, &victim->bio_list);
+	dest->bio_list_bytes += victim->bio_list_bytes;
+	bio_list_init(&victim->bio_list);
+}
+
+/*
+ * free the hash table used by unmount
+ */
+void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
+{
+	if (!info->stripe_hash_table)
+		return;
+	kfree(info->stripe_hash_table);
+	info->stripe_hash_table = NULL;
+}
+
+/*
+ * helper function to run the xor_blocks api.  It is only
+ * able to do MAX_XOR_BLOCKS at a time, so we need to
+ * loop through.
+ */
+static void run_xor(void **pages, int src_cnt, ssize_t len)
+{
+	int src_off = 0;
+	int xor_src_cnt = 0;
+	void *dest = pages[src_cnt];
+
+	while(src_cnt > 0) {
+		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
+		xor_blocks(xor_src_cnt, len, dest, pages + src_off);
+
+		src_cnt -= xor_src_cnt;
+		src_off += xor_src_cnt;
+	}
+}
+
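run_xor() accumulates sources into pages[src_cnt], so the destination has to be seeded first. To build the P stripe for one page position out of nr_data data pages, the rmw code later in this file does roughly this (hedged paraphrase):

	/* pointers[0..nr_data-1] are data pages, pointers[nr_data] is parity */
	memcpy(pointers[nr_data], pointers[0], PAGE_CACHE_SIZE);
	/* xor the remaining nr_data - 1 data pages into the parity page */
	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
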
+/*
+ * returns true if the bio list inside this rbio
+ * covers an entire stripe (no rmw required).
+ * Must be called with the bio list lock held, or
+ * at a time when you know it is impossible to add
+ * new bios into the list
+ */
+static int __rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long size = rbio->bio_list_bytes;
+	int ret = 1;
+
+	if (size != rbio->nr_data * rbio->stripe_len)
+		ret = 0;
+
+	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
+	return ret;
+}
+
+static int rbio_is_full(struct btrfs_raid_bio *rbio)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&rbio->bio_list_lock, flags);
+	ret = __rbio_is_full(rbio);
+	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
+	return ret;
+}
+
+/*
+ * returns 1 if it is safe to merge two rbios together.
+ * The merging is safe if the two rbios correspond to
+ * the same stripe and if they are both going in the same
+ * direction (read vs write), and if neither one is
+ * locked for final IO
+ *
+ * The caller is responsible for locking such that
+ * rmw_locked is safe to test
+ */
+static int rbio_can_merge(struct btrfs_raid_bio *last,
+			  struct btrfs_raid_bio *cur)
+{
+	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
+	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
+		return 0;
+
+	if (last->raid_map[0] !=
+	    cur->raid_map[0])
+		return 0;
+
+	/* reads can't merge with writes */
+	if (last->read_rebuild !=
+	    cur->read_rebuild) {
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * helper to index into the pstripe
+ */
+static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * helper to index into the qstripe, returns null
+ * if there is no qstripe
+ */
+static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
+{
+	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
+		return NULL;
+
+	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
+		PAGE_CACHE_SHIFT;
+	return rbio->stripe_pages[index];
+}
+
+/*
+ * The first stripe in the table for a logical address
+ * has the lock.  rbios are added in one of three ways:
+ *
+ * 1) Nobody has the stripe locked yet.  The rbio is given
+ * the lock and 0 is returned.  The caller must start the IO
+ * themselves.
+ *
+ * 2) Someone has the stripe locked, but we're able to merge
+ * with the lock owner.  The rbio is freed and the IO will
+ * start automatically along with the existing rbio.  1 is returned.
+ *
+ * 3) Someone has the stripe locked, but we're not able to merge.
+ * The rbio is added to the lock owner's plug list, or merged into
+ * an rbio already on the plug list.  When the lock owner unlocks,
+ * the next rbio on the list is run and the IO is started automatically.
+ * 1 is returned
+ *
+ * If we return 0, the caller still owns the rbio and must continue with
+ * IO submission.  If we return 1, the caller must assume the rbio has
+ * already been freed.
+ */
+static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
+{
+	int bucket = rbio_bucket(rbio);
+	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
+	struct btrfs_raid_bio *cur;
+	struct btrfs_raid_bio *pending;
+	unsigned long flags;
+	DEFINE_WAIT(wait);
+	struct btrfs_raid_bio *freeit = NULL;
+	int ret = 0;
+	int walk = 0;
+
+	spin_lock_irqsave(&h->lock, flags);
+	list_for_each_entry(cur, &h->hash_list, hash_list) {
+		walk++;
+		if (cur->raid_map[0] == rbio->raid_map[0]) {
+			spin_lock(&cur->bio_list_lock);
+
+			/* can we merge into the lock owner? */
+			if (rbio_can_merge(cur, rbio)) {
+				merge_rbio(cur, rbio);
+				spin_unlock(&cur->bio_list_lock);
+				freeit = rbio;
+				ret = 1;
+				goto out;
+			}
+
+			/*
+			 * we couldn't merge with the running
+			 * rbio, see if we can merge with the
+			 * pending ones.  We don't have to
+			 * check for rmw_locked because there
+			 * is no way they are inside finish_rmw
+			 * right now
+			 */
+			list_for_each_entry(pending, &cur->plug_list,
+					    plug_list) {
+				if (rbio_can_merge(pending, rbio)) {
+					merge_rbio(pending, rbio);
+					spin_unlock(&cur->bio_list_lock);
+					freeit = rbio;
+					ret = 1;
+					goto out;
+				}
+			}
+
+			/* no merging, put us on the tail of the plug list,
+			 * our rbio will be started with the currently
+			 * running rbio unlocks
+			 */
+			list_add_tail(&rbio->plug_list, &cur->plug_list);
+			spin_unlock(&cur->bio_list_lock);
+			ret = 1;
+			goto out;
+		}
+	}
+
+	atomic_inc(&rbio->refs);
+	list_add(&rbio->hash_list, &h->hash_list);
+out:
+	spin_unlock_irqrestore(&h->lock, flags);
+	if (freeit)
+		__free_raid_bio(freeit);
+	return ret;
+}
+
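The contract matters more than the internals: a return of 1 means the rbio was merged or queued and must not be touched again, while 0 means the caller now holds the stripe lock and has to kick the work itself. The submission paths later in this file follow roughly this shape (hedged sketch):

	if (!lock_stripe_add(rbio))
		async_rmw_stripe(rbio);	/* we own the lock; schedule the rmw */
	return 0;			/* on 1, the current lock owner runs it */
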
430 | /* | ||
431 | * called as rmw or parity rebuild is completed. If the plug list has more | ||
432 | * rbios waiting for this stripe, the next one on the list will be started | ||
433 | */ | ||
434 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | ||
435 | { | ||
436 | int bucket; | ||
437 | struct btrfs_stripe_hash *h; | ||
438 | unsigned long flags; | ||
439 | |||
440 | bucket = rbio_bucket(rbio); | ||
441 | h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
442 | |||
443 | spin_lock_irqsave(&h->lock, flags); | ||
444 | spin_lock(&rbio->bio_list_lock); | ||
445 | |||
446 | if (!list_empty(&rbio->hash_list)) { | ||
447 | |||
448 | list_del_init(&rbio->hash_list); | ||
449 | atomic_dec(&rbio->refs); | ||
450 | |||
451 | /* | ||
452 | * we use the plug list to hold all the rbios | ||
453 | * waiting for the chance to lock this stripe. | ||
454 | * hand the lock over to one of them. | ||
455 | */ | ||
456 | if (!list_empty(&rbio->plug_list)) { | ||
457 | struct btrfs_raid_bio *next; | ||
458 | struct list_head *head = rbio->plug_list.next; | ||
459 | |||
460 | next = list_entry(head, struct btrfs_raid_bio, | ||
461 | plug_list); | ||
462 | |||
463 | list_del_init(&rbio->plug_list); | ||
464 | |||
465 | list_add(&next->hash_list, &h->hash_list); | ||
466 | atomic_inc(&next->refs); | ||
467 | spin_unlock(&rbio->bio_list_lock); | ||
468 | spin_unlock_irqrestore(&h->lock, flags); | ||
469 | |||
470 | if (next->read_rebuild) | ||
471 | async_read_rebuild(next); | ||
472 | else | ||
473 | async_rmw_stripe(next); | ||
474 | |||
475 | goto done_nolock; | ||
476 | |||
477 | } else if (waitqueue_active(&h->wait)) { | ||
478 | spin_unlock(&rbio->bio_list_lock); | ||
479 | spin_unlock_irqrestore(&h->lock, flags); | ||
480 | wake_up(&h->wait); | ||
481 | goto done_nolock; | ||
482 | } | ||
483 | } | ||
484 | spin_unlock(&rbio->bio_list_lock); | ||
485 | spin_unlock_irqrestore(&h->lock, flags); | ||
486 | |||
487 | done_nolock: | ||
488 | return; | ||
489 | } | ||
490 | |||
491 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | ||
492 | { | ||
493 | int i; | ||
494 | |||
495 | WARN_ON(atomic_read(&rbio->refs) < 0); | ||
496 | if (!atomic_dec_and_test(&rbio->refs)) | ||
497 | return; | ||
498 | |||
499 | WARN_ON(!list_empty(&rbio->hash_list)); | ||
500 | WARN_ON(!bio_list_empty(&rbio->bio_list)); | ||
501 | |||
502 | for (i = 0; i < rbio->nr_pages; i++) { | ||
503 | if (rbio->stripe_pages[i]) { | ||
504 | __free_page(rbio->stripe_pages[i]); | ||
505 | rbio->stripe_pages[i] = NULL; | ||
506 | } | ||
507 | } | ||
508 | kfree(rbio->raid_map); | ||
509 | kfree(rbio->bbio); | ||
510 | kfree(rbio); | ||
511 | } | ||
512 | |||
513 | static void free_raid_bio(struct btrfs_raid_bio *rbio) | ||
514 | { | ||
515 | unlock_stripe(rbio); | ||
516 | __free_raid_bio(rbio); | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * this frees the rbio and runs through all the bios in the | ||
521 | * bio_list and calls end_io on them | ||
522 | */ | ||
523 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | ||
524 | { | ||
525 | struct bio *cur = bio_list_get(&rbio->bio_list); | ||
526 | struct bio *next; | ||
527 | free_raid_bio(rbio); | ||
528 | |||
529 | while (cur) { | ||
530 | next = cur->bi_next; | ||
531 | cur->bi_next = NULL; | ||
532 | if (uptodate) | ||
533 | set_bit(BIO_UPTODATE, &cur->bi_flags); | ||
534 | bio_endio(cur, err); | ||
535 | cur = next; | ||
536 | } | ||
537 | } | ||
538 | |||
539 | /* | ||
540 | * end io function used by finish_rmw. When we finally | ||
541 | * get here, we've written a full stripe | ||
542 | */ | ||
543 | static void raid_write_end_io(struct bio *bio, int err) | ||
544 | { | ||
545 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
546 | |||
547 | if (err) | ||
548 | fail_bio_stripe(rbio, bio); | ||
549 | |||
550 | bio_put(bio); | ||
551 | |||
552 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
553 | return; | ||
554 | |||
555 | err = 0; | ||
556 | |||
557 | /* OK, we have written all the stripes we need to. */ | ||
558 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
559 | err = -EIO; | ||
560 | |||
561 | rbio_orig_end_io(rbio, err, 0); | ||
562 | return; | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * the read/modify/write code wants to use the original bio for | ||
567 | * any pages it included, and then use the rbio for everything | ||
568 | * else. This function decides if a given index (stripe number) | ||
569 | * and page number in that stripe fall inside the original bio | ||
570 | * or the rbio. | ||
571 | * | ||
572 | * if you set bio_list_only, you'll get a NULL back for any ranges | ||
573 | * that are outside the bio_list | ||
574 | * | ||
575 | * This doesn't take any refs on anything, you get a bare page pointer | ||
576 | * and the caller must bump refs as required. | ||
577 | * | ||
578 | * You must call index_rbio_pages once before you can trust | ||
579 | * the answers from this function. | ||
580 | */ | ||
581 | static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, | ||
582 | int index, int pagenr, int bio_list_only) | ||
583 | { | ||
584 | int chunk_page; | ||
585 | struct page *p = NULL; | ||
586 | |||
587 | chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; | ||
588 | |||
589 | spin_lock_irq(&rbio->bio_list_lock); | ||
590 | p = rbio->bio_pages[chunk_page]; | ||
591 | spin_unlock_irq(&rbio->bio_list_lock); | ||
592 | |||
593 | if (p || bio_list_only) | ||
594 | return p; | ||
595 | |||
596 | return rbio->stripe_pages[chunk_page]; | ||
597 | } | ||
598 | |||
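A userspace model of the fall-through above (names illustrative): a page the higher layers sent down always wins, the rbio's own stripe pages are only the fallback, and bio_list_only callers get NULL instead of the fallback.

static void *page_lookup(void **bio_pages, void **stripe_pages,
			 int chunk_page, int bio_list_only)
{
	void *p = bio_pages[chunk_page];

	if (p || bio_list_only)
		return p;
	return stripe_pages[chunk_page];
}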
599 | /* | ||
600 | * number of pages we need for the entire stripe across all the | ||
601 | * drives | ||
602 | */ | ||
603 | static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) | ||
604 | { | ||
605 | unsigned long nr = stripe_len * nr_stripes; | ||
606 | return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
607 | } | ||
608 | |||
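A quick check of the math, as a standalone sketch assuming 4KiB pages (what PAGE_CACHE_SIZE was on x86 at the time):

#include <assert.h>

#define PAGE_CACHE_SIZE  4096UL
#define PAGE_CACHE_SHIFT 12

static unsigned long nr_pages_model(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;

	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
}

int main(void)
{
	/* 64KiB per-device stripes on a 4-disk raid5: 64 pages total,
	 * 16 of which back the parity stripe */
	assert(nr_pages_model(64 * 1024, 4) == 64);
	return 0;
}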
609 | /* | ||
610 | * allocation and initial setup for the btrfs_raid_bio. Note | ||
611 | * this does not allocate any pages for rbio->pages. | ||
612 | */ | ||
613 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | ||
614 | struct btrfs_bio *bbio, u64 *raid_map, | ||
615 | u64 stripe_len) | ||
616 | { | ||
617 | struct btrfs_raid_bio *rbio; | ||
618 | int nr_data = 0; | ||
619 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | ||
620 | void *p; | ||
621 | |||
622 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | ||
623 | GFP_NOFS); | ||
624 | if (!rbio) { | ||
625 | kfree(raid_map); | ||
626 | kfree(bbio); | ||
627 | return ERR_PTR(-ENOMEM); | ||
628 | } | ||
629 | |||
630 | bio_list_init(&rbio->bio_list); | ||
631 | INIT_LIST_HEAD(&rbio->plug_list); | ||
632 | spin_lock_init(&rbio->bio_list_lock); | ||
633 | INIT_LIST_HEAD(&rbio->hash_list); | ||
634 | rbio->bbio = bbio; | ||
635 | rbio->raid_map = raid_map; | ||
636 | rbio->fs_info = root->fs_info; | ||
637 | rbio->stripe_len = stripe_len; | ||
638 | rbio->nr_pages = num_pages; | ||
639 | rbio->faila = -1; | ||
640 | rbio->failb = -1; | ||
641 | atomic_set(&rbio->refs, 1); | ||
642 | |||
643 | /* | ||
644 | * the stripe_pages and bio_pages array point to the extra | ||
645 | * memory we allocated past the end of the rbio | ||
646 | */ | ||
647 | p = rbio + 1; | ||
648 | rbio->stripe_pages = p; | ||
649 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | ||
650 | |||
651 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
652 | nr_data = bbio->num_stripes - 2; | ||
653 | else | ||
654 | nr_data = bbio->num_stripes - 1; | ||
655 | |||
656 | rbio->nr_data = nr_data; | ||
657 | return rbio; | ||
658 | } | ||
659 | |||
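The single-allocation trick used above (struct plus both pointer arrays in one kzalloc) can be modeled in plain C; the names here are illustrative:

#include <stdlib.h>

struct rbio_model {
	int nr_pages;
	void **stripe_pages;	/* both arrays live in the same block, */
	void **bio_pages;	/* directly after the struct itself */
};

static struct rbio_model *alloc_model(int num_pages)
{
	/* one zeroed allocation covers the struct and 2 pointer arrays */
	struct rbio_model *r;

	r = calloc(1, sizeof(*r) + 2 * num_pages * sizeof(void *));
	if (!r)
		return NULL;
	r->nr_pages = num_pages;
	r->stripe_pages = (void **)(r + 1);
	r->bio_pages = r->stripe_pages + num_pages;
	return r;
}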
660 | /* allocate pages for all the stripes in the bio, including parity */ | ||
661 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) | ||
662 | { | ||
663 | int i; | ||
664 | struct page *page; | ||
665 | |||
666 | for (i = 0; i < rbio->nr_pages; i++) { | ||
667 | if (rbio->stripe_pages[i]) | ||
668 | continue; | ||
669 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
670 | if (!page) | ||
671 | return -ENOMEM; | ||
672 | rbio->stripe_pages[i] = page; | ||
673 | ClearPageUptodate(page); | ||
674 | } | ||
675 | return 0; | ||
676 | } | ||
677 | |||
678 | /* allocate pages for just the p/q stripes */ | ||
679 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | ||
680 | { | ||
681 | int i; | ||
682 | struct page *page; | ||
683 | |||
684 | i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
685 | |||
686 | for (; i < rbio->nr_pages; i++) { | ||
687 | if (rbio->stripe_pages[i]) | ||
688 | continue; | ||
689 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
690 | if (!page) | ||
691 | return -ENOMEM; | ||
692 | rbio->stripe_pages[i] = page; | ||
693 | } | ||
694 | return 0; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * add a single page from a specific stripe into our list of bios for IO | ||
699 | * this will try to merge into existing bios if possible, and returns | ||
700 | * zero if all went well. | ||
701 | */ | ||
702 | int rbio_add_io_page(struct btrfs_raid_bio *rbio, | ||
703 | struct bio_list *bio_list, | ||
704 | struct page *page, | ||
705 | int stripe_nr, | ||
706 | unsigned long page_index, | ||
707 | unsigned long bio_max_len) | ||
708 | { | ||
709 | struct bio *last = bio_list->tail; | ||
710 | u64 last_end = 0; | ||
711 | int ret; | ||
712 | struct bio *bio; | ||
713 | struct btrfs_bio_stripe *stripe; | ||
714 | u64 disk_start; | ||
715 | |||
716 | stripe = &rbio->bbio->stripes[stripe_nr]; | ||
717 | disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); | ||
718 | |||
719 | /* if the device is missing, just fail this stripe */ | ||
720 | if (!stripe->dev->bdev) | ||
721 | return fail_rbio_index(rbio, stripe_nr); | ||
722 | |||
723 | /* see if we can add this page onto our existing bio */ | ||
724 | if (last) { | ||
725 | last_end = (u64)last->bi_sector << 9; | ||
726 | last_end += last->bi_size; | ||
727 | |||
728 | /* | ||
729 | * we can't merge these if they are from different | ||
730 | * devices or if they are not contiguous | ||
731 | */ | ||
732 | if (last_end == disk_start && stripe->dev->bdev && | ||
733 | test_bit(BIO_UPTODATE, &last->bi_flags) && | ||
734 | last->bi_bdev == stripe->dev->bdev) { | ||
735 | ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); | ||
736 | if (ret == PAGE_CACHE_SIZE) | ||
737 | return 0; | ||
738 | } | ||
739 | } | ||
740 | |||
741 | /* put a new bio on the list */ | ||
742 | bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1); | ||
743 | if (!bio) | ||
744 | return -ENOMEM; | ||
745 | |||
746 | bio->bi_size = 0; | ||
747 | bio->bi_bdev = stripe->dev->bdev; | ||
748 | bio->bi_sector = disk_start >> 9; | ||
749 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
750 | |||
751 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
752 | bio_list_add(bio_list, bio); | ||
753 | return 0; | ||
754 | } | ||
755 | |||
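The merge test above boils down to "same device, and the new page starts exactly where the tail bio ends"; a standalone restatement with simplified field names:

#include <stdbool.h>
#include <stdint.h>

struct tail_bio {
	uint64_t sector;	/* start, in 512-byte sectors */
	uint32_t size;		/* bytes already queued */
	const void *bdev;	/* device the bio targets */
};

static bool can_merge(const struct tail_bio *last,
		      uint64_t disk_start, const void *bdev)
{
	uint64_t last_end = (last->sector << 9) + last->size;

	/* contiguous on disk and on the same device */
	return last_end == disk_start && last->bdev == bdev;
}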
756 | /* | ||
757 | * while we're doing the read/modify/write cycle, we could | ||
758 | * have errors in reading pages off the disk. This checks | ||
759 | * for errors and if we're not able to read the page it'll | ||
760 | * trigger parity reconstruction. The rmw will be finished | ||
761 | * after we've reconstructed the failed stripes | ||
762 | */ | ||
763 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | ||
764 | { | ||
765 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
766 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | ||
767 | __raid56_parity_recover(rbio); | ||
768 | } else { | ||
769 | finish_rmw(rbio); | ||
770 | } | ||
771 | } | ||
772 | |||
773 | /* | ||
774 | * these are just the pages from the rbio array, not from anything | ||
775 | * the FS sent down to us | ||
776 | */ | ||
777 | static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) | ||
778 | { | ||
779 | int index; | ||
780 | index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); | ||
781 | index += page; | ||
782 | return rbio->stripe_pages[index]; | ||
783 | } | ||
784 | |||
785 | /* | ||
786 | * helper function to walk our bio list and populate the bio_pages array with | ||
787 | * the result. This seems expensive, but it is faster than constantly | ||
788 | * searching through the bio list as we set up the IO in finish_rmw or stripe | ||
789 | * reconstruction. | ||
790 | * | ||
791 | * This must be called before you trust the answers from page_in_rbio | ||
792 | */ | ||
793 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) | ||
794 | { | ||
795 | struct bio *bio; | ||
796 | u64 start; | ||
797 | unsigned long stripe_offset; | ||
798 | unsigned long page_index; | ||
799 | struct page *p; | ||
800 | int i; | ||
801 | |||
802 | spin_lock_irq(&rbio->bio_list_lock); | ||
803 | bio_list_for_each(bio, &rbio->bio_list) { | ||
804 | start = (u64)bio->bi_sector << 9; | ||
805 | stripe_offset = start - rbio->raid_map[0]; | ||
806 | page_index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
807 | |||
808 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
809 | p = bio->bi_io_vec[i].bv_page; | ||
810 | rbio->bio_pages[page_index + i] = p; | ||
811 | } | ||
812 | } | ||
813 | spin_unlock_irq(&rbio->bio_list_lock); | ||
814 | } | ||
815 | |||
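The index math is the only subtlety here: a bio's logical start, minus the logical address of the first data stripe (raid_map[0]), gives a byte offset into the full stripe, which becomes a page slot. A worked example (illustrative numbers, 4KiB pages assumed):

#include <assert.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12

int main(void)
{
	uint64_t raid_map0 = 1 << 20;		/* stripe starts at 1MiB */
	uint64_t bio_start = raid_map0 + 128 * 1024;

	/* a bio 128KiB into the stripe indexes bio_pages[] from slot 32 */
	assert(((bio_start - raid_map0) >> PAGE_CACHE_SHIFT) == 32);
	return 0;
}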
816 | /* | ||
817 | * this is called in one of two situations. We either | ||
818 | * have a full stripe from the higher layers, or we've read all | ||
819 | * the missing bits off disk. | ||
820 | * | ||
821 | * This will calculate the parity and then send down any | ||
822 | * changed blocks. | ||
823 | */ | ||
824 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | ||
825 | { | ||
826 | struct btrfs_bio *bbio = rbio->bbio; | ||
827 | void *pointers[bbio->num_stripes]; | ||
828 | int stripe_len = rbio->stripe_len; | ||
829 | int nr_data = rbio->nr_data; | ||
830 | int stripe; | ||
831 | int pagenr; | ||
832 | int p_stripe = -1; | ||
833 | int q_stripe = -1; | ||
834 | struct bio_list bio_list; | ||
835 | struct bio *bio; | ||
836 | int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; | ||
837 | int ret; | ||
838 | |||
839 | bio_list_init(&bio_list); | ||
840 | |||
841 | if (bbio->num_stripes - rbio->nr_data == 1) { | ||
842 | p_stripe = bbio->num_stripes - 1; | ||
843 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | ||
844 | p_stripe = bbio->num_stripes - 2; | ||
845 | q_stripe = bbio->num_stripes - 1; | ||
846 | } else { | ||
847 | BUG(); | ||
848 | } | ||
849 | |||
850 | /* at this point we either have a full stripe, | ||
851 | * or we've read the full stripe from the drive. | ||
852 | * recalculate the parity and write the new results. | ||
853 | * | ||
854 | * We're not allowed to add any new bios to the | ||
855 | * bio list here, anyone else that wants to | ||
856 | * change this stripe needs to do their own rmw. | ||
857 | */ | ||
858 | spin_lock_irq(&rbio->bio_list_lock); | ||
859 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
860 | spin_unlock_irq(&rbio->bio_list_lock); | ||
861 | |||
862 | atomic_set(&rbio->bbio->error, 0); | ||
863 | |||
864 | /* | ||
865 | * now that we've set rmw_locked, run through the | ||
866 | * bio list one last time and map the page pointers | ||
867 | */ | ||
868 | index_rbio_pages(rbio); | ||
869 | |||
870 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
871 | struct page *p; | ||
872 | /* first collect one page from each data stripe */ | ||
873 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
874 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
875 | pointers[stripe] = kmap(p); | ||
876 | } | ||
877 | |||
878 | /* then add the parity stripe */ | ||
879 | p = rbio_pstripe_page(rbio, pagenr); | ||
880 | SetPageUptodate(p); | ||
881 | pointers[stripe++] = kmap(p); | ||
882 | |||
883 | if (q_stripe != -1) { | ||
884 | |||
885 | /* | ||
886 | * raid6, add the qstripe and call the | ||
887 | * library function to fill in our p/q | ||
888 | */ | ||
889 | p = rbio_qstripe_page(rbio, pagenr); | ||
890 | SetPageUptodate(p); | ||
891 | pointers[stripe++] = kmap(p); | ||
892 | |||
893 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | ||
894 | pointers); | ||
895 | } else { | ||
896 | /* raid5 */ | ||
897 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
898 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
899 | } | ||
900 | |||
901 | |||
902 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | ||
903 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
904 | } | ||
905 | |||
906 | /* | ||
907 | * time to start writing. Make bios for everything from the | ||
908 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | ||
909 | * everything else. | ||
910 | */ | ||
911 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
912 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
913 | struct page *page; | ||
914 | if (stripe < rbio->nr_data) { | ||
915 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
916 | if (!page) | ||
917 | continue; | ||
918 | } else { | ||
919 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
920 | } | ||
921 | |||
922 | ret = rbio_add_io_page(rbio, &bio_list, | ||
923 | page, stripe, pagenr, rbio->stripe_len); | ||
924 | if (ret) | ||
925 | goto cleanup; | ||
926 | } | ||
927 | } | ||
928 | |||
929 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | ||
930 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | ||
931 | |||
932 | while (1) { | ||
933 | bio = bio_list_pop(&bio_list); | ||
934 | if (!bio) | ||
935 | break; | ||
936 | |||
937 | bio->bi_private = rbio; | ||
938 | bio->bi_end_io = raid_write_end_io; | ||
939 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
940 | submit_bio(WRITE, bio); | ||
941 | } | ||
942 | return; | ||
943 | |||
944 | cleanup: | ||
945 | rbio_orig_end_io(rbio, -EIO, 0); | ||
946 | } | ||
947 | |||
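For the raid5 branch, the memcpy-then-xor sequence computes P = D0 ^ D1 ^ ... ^ Dn-1. A self-contained check of that identity; xor_into_last() is a stand-in for run_xor(), assuming the same "xor the sources into the last pointer" convention used above:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* stand-in for run_xor(): xor ptr[0..nsrc-1] into ptr[nsrc] */
static void xor_into_last(uint8_t **ptr, int nsrc, size_t len)
{
	for (int i = 0; i < nsrc; i++)
		for (size_t j = 0; j < len; j++)
			ptr[nsrc][j] ^= ptr[i][j];
}

int main(void)
{
	uint8_t d0[4] = {1, 2, 3, 4}, d1[4] = {5, 6, 7, 8}, p[4];
	uint8_t *rest[] = {d1, p};

	memcpy(p, d0, sizeof(p));	/* P starts as a copy of D0 */
	xor_into_last(rest, 1, 4);	/* then D1 is xored in */

	/* parity property: any lost block is the xor of the others */
	for (int j = 0; j < 4; j++)
		assert((p[j] ^ d0[j]) == d1[j]);
	return 0;
}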
948 | /* | ||
949 | * helper to find the stripe number for a given bio. Used to figure out which | ||
950 | * stripe has failed. This expects the bio to correspond to a physical disk, | ||
951 | * so it looks up based on physical sector numbers. | ||
952 | */ | ||
953 | static int find_bio_stripe(struct btrfs_raid_bio *rbio, | ||
954 | struct bio *bio) | ||
955 | { | ||
956 | u64 physical = bio->bi_sector; | ||
957 | u64 stripe_start; | ||
958 | int i; | ||
959 | struct btrfs_bio_stripe *stripe; | ||
960 | |||
961 | physical <<= 9; | ||
962 | |||
963 | for (i = 0; i < rbio->bbio->num_stripes; i++) { | ||
964 | stripe = &rbio->bbio->stripes[i]; | ||
965 | stripe_start = stripe->physical; | ||
966 | if (physical >= stripe_start && | ||
967 | physical < stripe_start + rbio->stripe_len) { | ||
968 | return i; | ||
969 | } | ||
970 | } | ||
971 | return -1; | ||
972 | } | ||
973 | |||
974 | /* | ||
975 | * helper to find the stripe number for a given | ||
976 | * bio (before mapping). Used to figure out which stripe has | ||
977 | * failed. This looks up based on logical block numbers. | ||
978 | */ | ||
979 | static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, | ||
980 | struct bio *bio) | ||
981 | { | ||
982 | u64 logical = bio->bi_sector; | ||
983 | u64 stripe_start; | ||
984 | int i; | ||
985 | |||
986 | logical <<= 9; | ||
987 | |||
988 | for (i = 0; i < rbio->nr_data; i++) { | ||
989 | stripe_start = rbio->raid_map[i]; | ||
990 | if (logical >= stripe_start && | ||
991 | logical < stripe_start + rbio->stripe_len) { | ||
992 | return i; | ||
993 | } | ||
994 | } | ||
995 | return -1; | ||
996 | } | ||
997 | |||
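Both lookups are the same interval scan, differing only in whether the table holds physical or logical stripe starts; condensed into a hypothetical standalone form:

#include <stdint.h>

/* return which stripe [starts[i], starts[i] + stripe_len) covers addr,
 * or -1; addr is physical for find_bio_stripe and logical for
 * find_logical_bio_stripe */
static int stripe_for(uint64_t addr, const uint64_t *starts,
		      int nr, uint64_t stripe_len)
{
	for (int i = 0; i < nr; i++)
		if (addr >= starts[i] && addr < starts[i] + stripe_len)
			return i;
	return -1;
}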
998 | /* | ||
999 | * returns -EIO if we had too many failures | ||
1000 | */ | ||
1001 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | ||
1002 | { | ||
1003 | unsigned long flags; | ||
1004 | int ret = 0; | ||
1005 | |||
1006 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
1007 | |||
1008 | /* we already know this stripe is bad, move on */ | ||
1009 | if (rbio->faila == failed || rbio->failb == failed) | ||
1010 | goto out; | ||
1011 | |||
1012 | if (rbio->faila == -1) { | ||
1013 | /* first failure on this rbio */ | ||
1014 | rbio->faila = failed; | ||
1015 | atomic_inc(&rbio->bbio->error); | ||
1016 | } else if (rbio->failb == -1) { | ||
1017 | /* second failure on this rbio */ | ||
1018 | rbio->failb = failed; | ||
1019 | atomic_inc(&rbio->bbio->error); | ||
1020 | } else { | ||
1021 | ret = -EIO; | ||
1022 | } | ||
1023 | out: | ||
1024 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
1025 | |||
1026 | return ret; | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
1030 | * helper to fail a stripe based on a physical disk | ||
1031 | * bio. | ||
1032 | */ | ||
1033 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1034 | struct bio *bio) | ||
1035 | { | ||
1036 | int failed = find_bio_stripe(rbio, bio); | ||
1037 | |||
1038 | if (failed < 0) | ||
1039 | return -EIO; | ||
1040 | |||
1041 | return fail_rbio_index(rbio, failed); | ||
1042 | } | ||
1043 | |||
1044 | /* | ||
1045 | * this sets each page in the bio uptodate. It should only be used on private | ||
1046 | * rbio pages, nothing that comes in from the higher layers | ||
1047 | */ | ||
1048 | static void set_bio_pages_uptodate(struct bio *bio) | ||
1049 | { | ||
1050 | int i; | ||
1051 | struct page *p; | ||
1052 | |||
1053 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
1054 | p = bio->bi_io_vec[i].bv_page; | ||
1055 | SetPageUptodate(p); | ||
1056 | } | ||
1057 | } | ||
1058 | |||
1059 | /* | ||
1060 | * end io for the read phase of the rmw cycle. All the bios here are physical | ||
1061 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
1062 | * stripe. | ||
1063 | * | ||
1064 | * This will usually kick off finish_rmw once all the bios are read in, but it | ||
1065 | * may trigger parity reconstruction if we had any errors along the way | ||
1066 | */ | ||
1067 | static void raid_rmw_end_io(struct bio *bio, int err) | ||
1068 | { | ||
1069 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1070 | |||
1071 | if (err) | ||
1072 | fail_bio_stripe(rbio, bio); | ||
1073 | else | ||
1074 | set_bio_pages_uptodate(bio); | ||
1075 | |||
1076 | bio_put(bio); | ||
1077 | |||
1078 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1079 | return; | ||
1080 | |||
1081 | err = 0; | ||
1082 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1083 | goto cleanup; | ||
1084 | |||
1085 | /* | ||
1086 | * this will normally call finish_rmw to start our write | ||
1087 | * but if there are any failed stripes we'll reconstruct | ||
1088 | * from parity first | ||
1089 | */ | ||
1090 | validate_rbio_for_rmw(rbio); | ||
1091 | return; | ||
1092 | |||
1093 | cleanup: | ||
1094 | |||
1095 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1096 | } | ||
1097 | |||
1098 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1099 | { | ||
1100 | rbio->work.flags = 0; | ||
1101 | rbio->work.func = rmw_work; | ||
1102 | |||
1103 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1104 | &rbio->work); | ||
1105 | } | ||
1106 | |||
1107 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | ||
1108 | { | ||
1109 | rbio->work.flags = 0; | ||
1110 | rbio->work.func = read_rebuild_work; | ||
1111 | |||
1112 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1113 | &rbio->work); | ||
1114 | } | ||
1115 | |||
1116 | /* | ||
1117 | * the stripe must be locked by the caller. It will | ||
1118 | * unlock after all the writes are done | ||
1119 | */ | ||
1120 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1121 | { | ||
1122 | int bios_to_read = 0; | ||
1123 | struct btrfs_bio *bbio = rbio->bbio; | ||
1124 | struct bio_list bio_list; | ||
1125 | int ret; | ||
1126 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1127 | int pagenr; | ||
1128 | int stripe; | ||
1129 | struct bio *bio; | ||
1130 | |||
1131 | bio_list_init(&bio_list); | ||
1132 | |||
1133 | ret = alloc_rbio_pages(rbio); | ||
1134 | if (ret) | ||
1135 | goto cleanup; | ||
1136 | |||
1137 | index_rbio_pages(rbio); | ||
1138 | |||
1139 | atomic_set(&rbio->bbio->error, 0); | ||
1140 | /* | ||
1141 | * build a list of bios to read all the missing parts of this | ||
1142 | * stripe | ||
1143 | */ | ||
1144 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { | ||
1145 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1146 | struct page *page; | ||
1147 | /* | ||
1148 | * we want to find all the pages missing from | ||
1149 | * the rbio and read them from the disk. If | ||
1150 | * page_in_rbio finds a page in the bio list | ||
1151 | * we don't need to read it off the stripe. | ||
1152 | */ | ||
1153 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1154 | if (page) | ||
1155 | continue; | ||
1156 | |||
1157 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1158 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
1159 | stripe, pagenr, rbio->stripe_len); | ||
1160 | if (ret) | ||
1161 | goto cleanup; | ||
1162 | } | ||
1163 | } | ||
1164 | |||
1165 | bios_to_read = bio_list_size(&bio_list); | ||
1166 | if (!bios_to_read) { | ||
1167 | /* | ||
1168 | * this can happen if others have merged with | ||
1169 | * us; it means there is nothing left to read. | ||
1170 | * But if there are missing devices it may not be | ||
1171 | * safe to do the full stripe write yet. | ||
1172 | */ | ||
1173 | goto finish; | ||
1174 | } | ||
1175 | |||
1176 | /* | ||
1177 | * the bbio may be freed once we submit the last bio. Make sure | ||
1178 | * not to touch it after that | ||
1179 | */ | ||
1180 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
1181 | while (1) { | ||
1182 | bio = bio_list_pop(&bio_list); | ||
1183 | if (!bio) | ||
1184 | break; | ||
1185 | |||
1186 | bio->bi_private = rbio; | ||
1187 | bio->bi_end_io = raid_rmw_end_io; | ||
1188 | |||
1189 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
1190 | BTRFS_WQ_ENDIO_RAID56); | ||
1191 | |||
1192 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1193 | submit_bio(READ, bio); | ||
1194 | } | ||
1195 | /* the actual write will happen once the reads are done */ | ||
1196 | return 0; | ||
1197 | |||
1198 | cleanup: | ||
1199 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1200 | return -EIO; | ||
1201 | |||
1202 | finish: | ||
1203 | validate_rbio_for_rmw(rbio); | ||
1204 | return 0; | ||
1205 | } | ||
1206 | |||
1207 | /* | ||
1208 | * if the upper layers pass in a full stripe, we thank them by only allocating | ||
1209 | * enough pages to hold the parity, and sending it all down quickly. | ||
1210 | */ | ||
1211 | static int full_stripe_write(struct btrfs_raid_bio *rbio) | ||
1212 | { | ||
1213 | int ret; | ||
1214 | |||
1215 | ret = alloc_rbio_parity_pages(rbio); | ||
1216 | if (ret) | ||
1217 | return ret; | ||
1218 | |||
1219 | ret = lock_stripe_add(rbio); | ||
1220 | if (ret == 0) | ||
1221 | finish_rmw(rbio); | ||
1222 | return 0; | ||
1223 | } | ||
1224 | |||
1225 | /* | ||
1226 | * partial stripe writes get handed over to async helpers. | ||
1227 | * We're really hoping to merge a few more writes into this | ||
1228 | * rbio before calculating new parity | ||
1229 | */ | ||
1230 | static int partial_stripe_write(struct btrfs_raid_bio *rbio) | ||
1231 | { | ||
1232 | int ret; | ||
1233 | |||
1234 | ret = lock_stripe_add(rbio); | ||
1235 | if (ret == 0) | ||
1236 | async_rmw_stripe(rbio); | ||
1237 | return 0; | ||
1238 | } | ||
1239 | |||
1240 | /* | ||
1241 | * sometimes while we were reading from the drive to | ||
1242 | * recalculate parity, enough new bios come in to create | ||
1243 | * a full stripe. So we do a check here to see if we can | ||
1244 | * go directly to finish_rmw | ||
1245 | */ | ||
1246 | static int __raid56_parity_write(struct btrfs_raid_bio *rbio) | ||
1247 | { | ||
1248 | /* head off into rmw land if we don't have a full stripe */ | ||
1249 | if (!rbio_is_full(rbio)) | ||
1250 | return partial_stripe_write(rbio); | ||
1251 | return full_stripe_write(rbio); | ||
1252 | } | ||
1253 | |||
1254 | /* | ||
1255 | * our main entry point for writes from the rest of the FS. | ||
1256 | */ | ||
1257 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
1258 | struct btrfs_bio *bbio, u64 *raid_map, | ||
1259 | u64 stripe_len) | ||
1260 | { | ||
1261 | struct btrfs_raid_bio *rbio; | ||
1262 | |||
1263 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
1264 | if (IS_ERR(rbio)) { | ||
1265 | kfree(raid_map); | ||
1266 | kfree(bbio); | ||
1267 | return PTR_ERR(rbio); | ||
1268 | } | ||
1269 | bio_list_add(&rbio->bio_list, bio); | ||
1270 | rbio->bio_list_bytes = bio->bi_size; | ||
1271 | return __raid56_parity_write(rbio); | ||
1272 | } | ||
1273 | |||
1274 | /* | ||
1275 | * all parity reconstruction happens here. We've read in everything | ||
1276 | * we can find from the drives and this does the heavy lifting of | ||
1277 | * sorting the good from the bad. | ||
1278 | */ | ||
1279 | static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | ||
1280 | { | ||
1281 | int pagenr, stripe; | ||
1282 | void **pointers; | ||
1283 | int faila = -1, failb = -1; | ||
1284 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1285 | struct page *page; | ||
1286 | int err; | ||
1287 | int i; | ||
1288 | |||
1289 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | ||
1290 | GFP_NOFS); | ||
1291 | if (!pointers) { | ||
1292 | err = -ENOMEM; | ||
1293 | goto cleanup_io; | ||
1294 | } | ||
1295 | |||
1296 | faila = rbio->faila; | ||
1297 | failb = rbio->failb; | ||
1298 | |||
1299 | if (rbio->read_rebuild) { | ||
1300 | spin_lock_irq(&rbio->bio_list_lock); | ||
1301 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
1302 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1303 | } | ||
1304 | |||
1305 | index_rbio_pages(rbio); | ||
1306 | |||
1307 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1308 | /* setup our array of pointers with pages | ||
1309 | * from each stripe | ||
1310 | */ | ||
1311 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1312 | /* | ||
1313 | * if we're rebuilding a read, we have to use | ||
1314 | * pages from the bio list | ||
1315 | */ | ||
1316 | if (rbio->read_rebuild && | ||
1317 | (stripe == faila || stripe == failb)) { | ||
1318 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1319 | } else { | ||
1320 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1321 | } | ||
1322 | pointers[stripe] = kmap(page); | ||
1323 | } | ||
1324 | |||
1325 | /* all raid6 handling here */ | ||
1326 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | ||
1327 | RAID6_Q_STRIPE) { | ||
1328 | |||
1329 | /* | ||
1330 | * single failure, rebuild from parity raid5 | ||
1331 | * style | ||
1332 | */ | ||
1333 | if (failb < 0) { | ||
1334 | if (faila == rbio->nr_data) { | ||
1335 | /* | ||
1336 | * Just the P stripe has failed, without | ||
1337 | * a bad data or Q stripe. | ||
1338 | * TODO, we should redo the xor here. | ||
1339 | */ | ||
1340 | err = -EIO; | ||
1341 | goto cleanup; | ||
1342 | } | ||
1343 | /* | ||
1344 | * a single failure in raid6 is rebuilt | ||
1345 | * in the pstripe code below | ||
1346 | */ | ||
1347 | goto pstripe; | ||
1348 | } | ||
1349 | |||
1350 | /* make sure our ps and qs are in order */ | ||
1351 | if (faila > failb) { | ||
1352 | int tmp = failb; | ||
1353 | failb = faila; | ||
1354 | faila = tmp; | ||
1355 | } | ||
1356 | |||
1357 | /* if the q stripe is failed, do a pstripe reconstruction | ||
1358 | * from the xors. | ||
1359 | * If both the q stripe and the P stripe are failed, we're | ||
1360 | * here due to a crc mismatch and we can't give them the | ||
1361 | * data they want | ||
1362 | */ | ||
1363 | if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { | ||
1364 | if (rbio->raid_map[faila] == RAID5_P_STRIPE) { | ||
1365 | err = -EIO; | ||
1366 | goto cleanup; | ||
1367 | } | ||
1368 | /* | ||
1369 | * otherwise we have one bad data stripe and | ||
1370 | * a good P stripe. raid5! | ||
1371 | */ | ||
1372 | goto pstripe; | ||
1373 | } | ||
1374 | |||
1375 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | ||
1376 | raid6_datap_recov(rbio->bbio->num_stripes, | ||
1377 | PAGE_SIZE, faila, pointers); | ||
1378 | } else { | ||
1379 | raid6_2data_recov(rbio->bbio->num_stripes, | ||
1380 | PAGE_SIZE, faila, failb, | ||
1381 | pointers); | ||
1382 | } | ||
1383 | } else { | ||
1384 | void *p; | ||
1385 | |||
1386 | /* rebuild from P stripe here (raid5 or raid6) */ | ||
1387 | BUG_ON(failb != -1); | ||
1388 | pstripe: | ||
1389 | /* Copy parity block into failed block to start with */ | ||
1390 | memcpy(pointers[faila], | ||
1391 | pointers[rbio->nr_data], | ||
1392 | PAGE_CACHE_SIZE); | ||
1393 | |||
1394 | /* rearrange the pointer array */ | ||
1395 | p = pointers[faila]; | ||
1396 | for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) | ||
1397 | pointers[stripe] = pointers[stripe + 1]; | ||
1398 | pointers[rbio->nr_data - 1] = p; | ||
1399 | |||
1400 | /* xor in the rest */ | ||
1401 | run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); | ||
1402 | } | ||
1403 | /* if we're doing this rebuild as part of an rmw, go through | ||
1404 | * and set all of our private rbio pages in the | ||
1405 | * failed stripes as uptodate. This way finish_rmw will | ||
1406 | * know they can be trusted. If this was a read reconstruction, | ||
1407 | * other endio functions will fiddle the uptodate bits | ||
1408 | */ | ||
1409 | if (!rbio->read_rebuild) { | ||
1410 | for (i = 0; i < nr_pages; i++) { | ||
1411 | if (faila != -1) { | ||
1412 | page = rbio_stripe_page(rbio, faila, i); | ||
1413 | SetPageUptodate(page); | ||
1414 | } | ||
1415 | if (failb != -1) { | ||
1416 | page = rbio_stripe_page(rbio, failb, i); | ||
1417 | SetPageUptodate(page); | ||
1418 | } | ||
1419 | } | ||
1420 | } | ||
1421 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1422 | /* | ||
1423 | * if we're rebuilding a read, we have to use | ||
1424 | * pages from the bio list | ||
1425 | */ | ||
1426 | if (rbio->read_rebuild && | ||
1427 | (stripe == faila || stripe == failb)) { | ||
1428 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1429 | } else { | ||
1430 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1431 | } | ||
1432 | kunmap(page); | ||
1433 | } | ||
1434 | } | ||
1435 | |||
1436 | err = 0; | ||
1437 | cleanup: | ||
1438 | kfree(pointers); | ||
1439 | |||
1440 | cleanup_io: | ||
1441 | |||
1442 | if (rbio->read_rebuild) { | ||
1443 | rbio_orig_end_io(rbio, err, err == 0); | ||
1444 | } else if (err == 0) { | ||
1445 | rbio->faila = -1; | ||
1446 | rbio->failb = -1; | ||
1447 | finish_rmw(rbio); | ||
1448 | } else { | ||
1449 | rbio_orig_end_io(rbio, err, 0); | ||
1450 | } | ||
1451 | } | ||
1452 | |||
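The raid6 case analysis above condenses to a small decision table. A sketch assuming faila < failb after the swap, with p_idx = nr_data and q_idx = nr_data + 1 (names hypothetical):

enum rebuild {
	USE_PSTRIPE,	/* raid5-style xor against P */
	USE_DATAP,	/* raid6_datap_recov(): data + P lost */
	USE_2DATA,	/* raid6_2data_recov(): two data stripes lost */
	UNRECOVERABLE,
};

static enum rebuild pick_strategy(int faila, int failb, int p_idx, int q_idx)
{
	if (failb < 0)		/* single loss; a lone P loss is the
				 * "TODO, redo the xor" -EIO path above */
		return faila == p_idx ? UNRECOVERABLE : USE_PSTRIPE;
	if (failb == q_idx)	/* Q plus one more: P+Q lost is fatal,
				 * data+Q rebuilds raid5-style from P */
		return faila == p_idx ? UNRECOVERABLE : USE_PSTRIPE;
	if (failb == p_idx)	/* one data stripe plus P */
		return USE_DATAP;
	return USE_2DATA;	/* two data stripes */
}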
1453 | /* | ||
1454 | * This is called only for stripes we've read from disk to | ||
1455 | * reconstruct the parity. | ||
1456 | */ | ||
1457 | static void raid_recover_end_io(struct bio *bio, int err) | ||
1458 | { | ||
1459 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1460 | |||
1461 | /* | ||
1462 | * we only read stripe pages off the disk, so set them | ||
1463 | * up to date if there were no errors | ||
1464 | */ | ||
1465 | if (err) | ||
1466 | fail_bio_stripe(rbio, bio); | ||
1467 | else | ||
1468 | set_bio_pages_uptodate(bio); | ||
1469 | bio_put(bio); | ||
1470 | |||
1471 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1472 | return; | ||
1473 | |||
1474 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1475 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1476 | else | ||
1477 | __raid_recover_end_io(rbio); | ||
1478 | } | ||
1479 | |||
1480 | /* | ||
1481 | * reads everything we need off the disk to reconstruct | ||
1482 | * the parity. endio handlers trigger final reconstruction | ||
1483 | * when the IO is done. | ||
1484 | * | ||
1485 | * This is used both for reads from the higher layers and for | ||
1486 | * parity construction required to finish an rmw cycle. | ||
1487 | */ | ||
1488 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | ||
1489 | { | ||
1490 | int bios_to_read = 0; | ||
1491 | struct btrfs_bio *bbio = rbio->bbio; | ||
1492 | struct bio_list bio_list; | ||
1493 | int ret; | ||
1494 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1495 | int pagenr; | ||
1496 | int stripe; | ||
1497 | struct bio *bio; | ||
1498 | |||
1499 | bio_list_init(&bio_list); | ||
1500 | |||
1501 | ret = alloc_rbio_pages(rbio); | ||
1502 | if (ret) | ||
1503 | goto cleanup; | ||
1504 | |||
1505 | atomic_set(&rbio->bbio->error, 0); | ||
1506 | |||
1507 | /* | ||
1508 | * read everything that hasn't failed. | ||
1509 | */ | ||
1510 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
1511 | if (rbio->faila == stripe || | ||
1512 | rbio->failb == stripe) | ||
1513 | continue; | ||
1514 | |||
1515 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1516 | struct page *p; | ||
1517 | |||
1518 | /* | ||
1519 | * the rmw code may have already read this | ||
1520 | * page in | ||
1521 | */ | ||
1522 | p = rbio_stripe_page(rbio, stripe, pagenr); | ||
1523 | if (PageUptodate(p)) | ||
1524 | continue; | ||
1525 | |||
1526 | ret = rbio_add_io_page(rbio, &bio_list, | ||
1527 | rbio_stripe_page(rbio, stripe, pagenr), | ||
1528 | stripe, pagenr, rbio->stripe_len); | ||
1529 | if (ret < 0) | ||
1530 | goto cleanup; | ||
1531 | } | ||
1532 | } | ||
1533 | |||
1534 | bios_to_read = bio_list_size(&bio_list); | ||
1535 | if (!bios_to_read) { | ||
1536 | /* | ||
1537 | * we might have no bios to read just because the pages | ||
1538 | * were up to date, or we might have no bios to read because | ||
1539 | * the devices were gone. | ||
1540 | */ | ||
1541 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | ||
1542 | __raid_recover_end_io(rbio); | ||
1543 | goto out; | ||
1544 | } else { | ||
1545 | goto cleanup; | ||
1546 | } | ||
1547 | } | ||
1548 | |||
1549 | /* | ||
1550 | * the bbio may be freed once we submit the last bio. Make sure | ||
1551 | * not to touch it after that | ||
1552 | */ | ||
1553 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
1554 | while (1) { | ||
1555 | bio = bio_list_pop(&bio_list); | ||
1556 | if (!bio) | ||
1557 | break; | ||
1558 | |||
1559 | bio->bi_private = rbio; | ||
1560 | bio->bi_end_io = raid_recover_end_io; | ||
1561 | |||
1562 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
1563 | BTRFS_WQ_ENDIO_RAID56); | ||
1564 | |||
1565 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1566 | submit_bio(READ, bio); | ||
1567 | } | ||
1568 | out: | ||
1569 | return 0; | ||
1570 | |||
1571 | cleanup: | ||
1572 | if (rbio->read_rebuild) | ||
1573 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1574 | return -EIO; | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * the main entry point for reads from the higher layers. This | ||
1579 | * is really only called when the normal read path had a failure, | ||
1580 | * so we assume the bio they send down corresponds to a failed part | ||
1581 | * of the drive. | ||
1582 | */ | ||
1583 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
1584 | struct btrfs_bio *bbio, u64 *raid_map, | ||
1585 | u64 stripe_len, int mirror_num) | ||
1586 | { | ||
1587 | struct btrfs_raid_bio *rbio; | ||
1588 | int ret; | ||
1589 | |||
1590 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
1591 | if (IS_ERR(rbio)) { | ||
1592 | return PTR_ERR(rbio); | ||
1593 | } | ||
1594 | |||
1595 | rbio->read_rebuild = 1; | ||
1596 | bio_list_add(&rbio->bio_list, bio); | ||
1597 | rbio->bio_list_bytes = bio->bi_size; | ||
1598 | |||
1599 | rbio->faila = find_logical_bio_stripe(rbio, bio); | ||
1600 | if (rbio->faila == -1) { | ||
1601 | BUG(); | ||
1602 | kfree(rbio); | ||
1603 | return -EIO; | ||
1604 | } | ||
1605 | |||
1606 | /* | ||
1607 | * reconstruct from the q stripe if they are | ||
1608 | * asking for mirror 3 | ||
1609 | */ | ||
1610 | if (mirror_num == 3) | ||
1611 | rbio->failb = bbio->num_stripes - 2; | ||
1612 | |||
1613 | ret = lock_stripe_add(rbio); | ||
1614 | |||
1615 | /* | ||
1616 | * __raid56_parity_recover will end the bio with | ||
1617 | * any errors it hits. We don't want to return | ||
1618 | * its error value up the stack because our caller | ||
1619 | * will end up calling bio_endio with any nonzero | ||
1620 | * return | ||
1621 | */ | ||
1622 | if (ret == 0) | ||
1623 | __raid56_parity_recover(rbio); | ||
1624 | /* | ||
1625 | * our rbio has been added to the list of | ||
1626 | * rbios that will be handled after the | ||
1627 | * current lock owner is done | ||
1628 | */ | ||
1629 | return 0; | ||
1630 | |||
1631 | } | ||
1632 | |||
1633 | static void rmw_work(struct btrfs_work *work) | ||
1634 | { | ||
1635 | struct btrfs_raid_bio *rbio; | ||
1636 | |||
1637 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
1638 | raid56_rmw_stripe(rbio); | ||
1639 | } | ||
1640 | |||
1641 | static void read_rebuild_work(struct btrfs_work *work) | ||
1642 | { | ||
1643 | struct btrfs_raid_bio *rbio; | ||
1644 | |||
1645 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
1646 | __raid56_parity_recover(rbio); | ||
1647 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h | |||
@@ -0,0 +1,51 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public | ||
7 | * License v2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public | ||
15 | * License along with this program; if not, write to the | ||
16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
17 | * Boston, MA 02111-1307, USA. | ||
18 | */ | ||
19 | |||
20 | #ifndef __BTRFS_RAID56__ | ||
21 | #define __BTRFS_RAID56__ | ||
22 | static inline int nr_parity_stripes(struct map_lookup *map) | ||
23 | { | ||
24 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
25 | return 1; | ||
26 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
27 | return 2; | ||
28 | else | ||
29 | return 0; | ||
30 | } | ||
31 | |||
32 | static inline int nr_data_stripes(struct map_lookup *map) | ||
33 | { | ||
34 | return map->num_stripes - nr_parity_stripes(map); | ||
35 | } | ||
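An illustrative reading of the two helpers (the device count is hypothetical):

/* a 6-device RAID6 map:
 *	nr_parity_stripes(map) == 2  (P and Q)
 *	nr_data_stripes(map)   == 4  (map->num_stripes - 2)
 * so each full stripe carries 4 * stripe_len bytes of data */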
36 | #define RAID5_P_STRIPE ((u64)-2) | ||
37 | #define RAID6_Q_STRIPE ((u64)-1) | ||
38 | |||
39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | ||
40 | ((x) == RAID6_Q_STRIPE)) | ||
41 | |||
42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
43 | struct btrfs_bio *bbio, u64 *raid_map, | ||
44 | u64 stripe_len, int mirror_num); | ||
45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
46 | struct btrfs_bio *bbio, u64 *raid_map, | ||
47 | u64 stripe_len); | ||
48 | |||
49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | ||
50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | ||
51 | #endif | ||
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..bc35ed4238b8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
29 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
30 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
31 | #include "raid56.h" | ||
31 | 32 | ||
32 | /* | 33 | /* |
33 | * This is only the first step towards a full-features scrub. It reads all | 34 | * This is only the first step towards a full-features scrub. It reads all |
@@ -2246,6 +2247,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2246 | struct btrfs_device *extent_dev; | 2247 | struct btrfs_device *extent_dev; |
2247 | int extent_mirror_num; | 2248 | int extent_mirror_num; |
2248 | 2249 | ||
2250 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
2251 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
2252 | if (num >= nr_data_stripes(map)) { | ||
2253 | return 0; | ||
2254 | } | ||
2255 | } | ||
2256 | |||
2249 | nstripes = length; | 2257 | nstripes = length; |
2250 | offset = 0; | 2258 | offset = 0; |
2251 | do_div(nstripes, map->stripe_len); | 2259 | do_div(nstripes, map->stripe_len); |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..a065dec0e330 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -686,7 +686,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
686 | struct extent_state *cached_state = NULL; | 686 | struct extent_state *cached_state = NULL; |
687 | u64 start = 0; | 687 | u64 start = 0; |
688 | u64 end; | 688 | u64 end; |
689 | struct blk_plug plug; | ||
689 | 690 | ||
691 | blk_start_plug(&plug); | ||
690 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 692 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
691 | mark, &cached_state)) { | 693 | mark, &cached_state)) { |
692 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 694 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, |
@@ -700,6 +702,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
700 | } | 702 | } |
701 | if (err) | 703 | if (err) |
702 | werr = err; | 704 | werr = err; |
705 | blk_finish_plug(&plug); | ||
703 | return werr; | 706 | return werr; |
704 | } | 707 | } |
705 | 708 | ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 485a5423e3c6..c372264b85bf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <linux/raid/pq.h> | ||
29 | #include <asm/div64.h> | ||
28 | #include "compat.h" | 30 | #include "compat.h" |
29 | #include "ctree.h" | 31 | #include "ctree.h" |
30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
@@ -32,6 +34,7 @@ | |||
32 | #include "transaction.h" | 34 | #include "transaction.h" |
33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
34 | #include "volumes.h" | 36 | #include "volumes.h" |
37 | #include "raid56.h" | ||
35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
@@ -1389,6 +1392,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1389 | } | 1392 | } |
1390 | btrfs_dev_replace_unlock(&root->fs_info->dev_replace); | 1393 | btrfs_dev_replace_unlock(&root->fs_info->dev_replace); |
1391 | 1394 | ||
1395 | if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 | | ||
1396 | BTRFS_BLOCK_GROUP_RAID6)) && num_devices <= 3) { | ||
1397 | printk(KERN_ERR "btrfs: unable to go below three devices " | ||
1398 | "on raid5 or raid6\n"); | ||
1399 | ret = -EINVAL; | ||
1400 | goto out; | ||
1401 | } | ||
1402 | |||
1392 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { | 1403 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { |
1393 | printk(KERN_ERR "btrfs: unable to go below four devices " | 1404 | printk(KERN_ERR "btrfs: unable to go below four devices " |
1394 | "on raid10\n"); | 1405 | "on raid10\n"); |
@@ -1403,6 +1414,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1403 | goto out; | 1414 | goto out; |
1404 | } | 1415 | } |
1405 | 1416 | ||
1417 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
1418 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
1419 | printk(KERN_ERR "btrfs: unable to go below two " | ||
1420 | "devices on raid5\n"); | ||
1421 | ret = -EINVAL; | ||
1422 | goto out; | ||
1423 | } | ||
1424 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
1425 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
1426 | printk(KERN_ERR "btrfs: unable to go below three " | ||
1427 | "devices on raid6\n"); | ||
1428 | ret = -EINVAL; | ||
1429 | goto out; | ||
1430 | } | ||
1431 | |||
1406 | if (strcmp(device_path, "missing") == 0) { | 1432 | if (strcmp(device_path, "missing") == 0) { |
1407 | struct list_head *devices; | 1433 | struct list_head *devices; |
1408 | struct btrfs_device *tmp; | 1434 | struct btrfs_device *tmp; |
@@ -2657,11 +2683,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
2657 | return 0; | 2683 | return 0; |
2658 | 2684 | ||
2659 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2685 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
2660 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2686 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
2661 | factor = 2; | 2687 | factor = num_stripes / 2; |
2662 | else | 2688 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
2663 | factor = 1; | 2689 | factor = num_stripes - 1; |
2664 | factor = num_stripes / factor; | 2690 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
2691 | factor = num_stripes - 2; | ||
2692 | } else { | ||
2693 | factor = num_stripes; | ||
2694 | } | ||
2665 | 2695 | ||
2666 | for (i = 0; i < num_stripes; i++) { | 2696 | for (i = 0; i < num_stripes; i++) { |
2667 | stripe = btrfs_stripe_nr(chunk, i); | 2697 | stripe = btrfs_stripe_nr(chunk, i); |
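The new factor selection converts the raw stripe count into data-bearing stripes; restated as a standalone sketch (the flag constants are stand-ins for the BTRFS_BLOCK_GROUP_* bits):

#define F_DUP_LIKE	0x1	/* DUP | RAID1 | RAID10 */
#define F_RAID5		0x2
#define F_RAID6		0x4

static int data_stripe_factor(unsigned long type, int num_stripes)
{
	if (type & F_DUP_LIKE)
		return num_stripes / 2;		/* every byte stored twice */
	if (type & F_RAID5)
		return num_stripes - 1;		/* one parity stripe */
	if (type & F_RAID6)
		return num_stripes - 2;		/* two parity stripes */
	return num_stripes;			/* raid0/single: all data */
}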
@@ -2976,6 +3006,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
2976 | int mixed = 0; | 3006 | int mixed = 0; |
2977 | int ret; | 3007 | int ret; |
2978 | u64 num_devices; | 3008 | u64 num_devices; |
3009 | int cancel = 0; | ||
2979 | 3010 | ||
2980 | if (btrfs_fs_closing(fs_info) || | 3011 | if (btrfs_fs_closing(fs_info) || |
2981 | atomic_read(&fs_info->balance_pause_req) || | 3012 | atomic_read(&fs_info->balance_pause_req) || |
@@ -3018,7 +3049,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3018 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3049 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
3019 | else | 3050 | else |
3020 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3051 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
3021 | BTRFS_BLOCK_GROUP_RAID10); | 3052 | BTRFS_BLOCK_GROUP_RAID10 | |
3053 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3054 | BTRFS_BLOCK_GROUP_RAID6); | ||
3022 | 3055 | ||
3023 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3056 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3024 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3057 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
@@ -3058,7 +3091,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3058 | 3091 | ||
3059 | /* allow to reduce meta or sys integrity only if force set */ | 3092 | /* allow to reduce meta or sys integrity only if force set */ |
3060 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3093 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
3061 | BTRFS_BLOCK_GROUP_RAID10; | 3094 | BTRFS_BLOCK_GROUP_RAID10 | |
3095 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3096 | BTRFS_BLOCK_GROUP_RAID6; | ||
3097 | |||
3062 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3098 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3063 | (fs_info->avail_system_alloc_bits & allowed) && | 3099 | (fs_info->avail_system_alloc_bits & allowed) && |
3064 | !(bctl->sys.target & allowed)) || | 3100 | !(bctl->sys.target & allowed)) || |
@@ -3124,15 +3160,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3124 | } | 3160 | } |
3125 | 3161 | ||
3126 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | 3162 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || |
3127 | balance_need_close(fs_info)) { | 3163 | balance_need_close(fs_info)) |
3128 | __cancel_balance(fs_info); | 3164 | cancel = 1; |
3129 | } | ||
3130 | 3165 | ||
3131 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3166 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
3132 | fs_info->num_tolerated_disk_barrier_failures = | 3167 | fs_info->num_tolerated_disk_barrier_failures = |
3133 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); | 3168 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); |
3134 | } | 3169 | } |
3135 | 3170 | ||
3171 | if (cancel) | ||
3172 | __cancel_balance(fs_info); | ||
3173 | |||
3136 | wake_up(&fs_info->balance_wait_q); | 3174 | wake_up(&fs_info->balance_wait_q); |
3137 | 3175 | ||
3138 | return ret; | 3176 | return ret; |
@@ -3493,13 +3531,45 @@ static int btrfs_cmp_device_info(const void *a, const void *b) | |||
3493 | } | 3531 | } |
3494 | 3532 | ||
3495 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | 3533 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
3534 | /* | ||
3535 | * sub_stripes info for map, | ||
3536 | * dev_stripes -- stripes per dev, 2 for DUP, 1 otherwise | ||
3537 | * devs_max -- max devices per stripe, 0 for unlimited | ||
3538 | * devs_min -- min devices per stripe | ||
3539 | * devs_increment -- ndevs must be a multiple of this | ||
3540 | * ncopies -- how many copies of the data we have | ||
3541 | */ | ||
3496 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, | 3542 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, |
3497 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | 3543 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, |
3498 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | 3544 | { 1, 2, 1, 1, 1, 2 /* dup */ }, |
3499 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | 3545 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, |
3500 | { 1, 1, 0, 1, 1, 1 /* single */ }, | 3546 | { 1, 1, 0, 1, 1, 1 /* single */ }, |
3547 | { 1, 1, 0, 2, 1, 2 /* raid5 */ }, | ||
3548 | { 1, 1, 0, 3, 1, 3 /* raid6 */ }, | ||
3501 | }; | 3549 | }; |
3502 | 3550 | ||
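Read against the field order documented in the new comment, the raid5/raid6 rows decode as follows (an annotated restatement, not a struct from the patch):

/* { sub_stripes, dev_stripes, devs_max, devs_min, devs_increment, ncopies }
 *
 * raid5: { 1, 1, 0, 2, 1, 2 } -- at least 2 devices; data counted twice
 *                                (direct read, or rebuilt via P)
 * raid6: { 1, 1, 0, 3, 1, 3 } -- at least 3 devices; counted three ways
 *                                (direct, via P, via Q) */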
3551 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
3552 | { | ||
3553 | /* TODO allow them to set a preferred stripe size */ | ||
3554 | return 64 * 1024; | ||
3555 | } | ||
3556 | |||
3557 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
3558 | { | ||
3559 | u64 features; | ||
3560 | |||
3561 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
3562 | return; | ||
3563 | |||
3564 | features = btrfs_super_incompat_flags(info->super_copy); | ||
3565 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
3566 | return; | ||
3567 | |||
3568 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
3569 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
3570 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
3571 | } | ||
3572 | |||
3503 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3573 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
3504 | struct btrfs_root *extent_root, | 3574 | struct btrfs_root *extent_root, |
3505 | struct map_lookup **map_ret, | 3575 | struct map_lookup **map_ret, |
@@ -3515,6 +3585,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3515 | struct btrfs_device_info *devices_info = NULL; | 3585 | struct btrfs_device_info *devices_info = NULL; |
3516 | u64 total_avail; | 3586 | u64 total_avail; |
3517 | int num_stripes; /* total number of stripes to allocate */ | 3587 | int num_stripes; /* total number of stripes to allocate */ |
3588 | int data_stripes; /* number of stripes that count for | ||
3589 | block group size */ | ||
3518 | int sub_stripes; /* sub_stripes info for map */ | 3590 | int sub_stripes; /* sub_stripes info for map */ |
3519 | int dev_stripes; /* stripes per dev */ | 3591 | int dev_stripes; /* stripes per dev */ |
3520 | int devs_max; /* max devs to use */ | 3592 | int devs_max; /* max devs to use */ |
@@ -3526,6 +3598,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3526 | u64 max_chunk_size; | 3598 | u64 max_chunk_size; |
3527 | u64 stripe_size; | 3599 | u64 stripe_size; |
3528 | u64 num_bytes; | 3600 | u64 num_bytes; |
3601 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
3529 | int ndevs; | 3602 | int ndevs; |
3530 | int i; | 3603 | int i; |
3531 | int j; | 3604 | int j; |
@@ -3651,16 +3724,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3651 | stripe_size = devices_info[ndevs-1].max_avail; | 3724 | stripe_size = devices_info[ndevs-1].max_avail; |
3652 | num_stripes = ndevs * dev_stripes; | 3725 | num_stripes = ndevs * dev_stripes; |
3653 | 3726 | ||
3727 | /* | ||
3728 | * this will have to be fixed for RAID1 and RAID10 over | ||
3729 | * more drives | ||
3730 | */ | ||
3731 | data_stripes = num_stripes / ncopies; | ||
3732 | |||
3654 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3733 | if (stripe_size * ndevs > max_chunk_size * ncopies) { |
3655 | stripe_size = max_chunk_size * ncopies; | 3734 | stripe_size = max_chunk_size * ncopies; |
3656 | do_div(stripe_size, ndevs); | 3735 | do_div(stripe_size, ndevs); |
3657 | } | 3736 | } |
3658 | 3737 | if (type & BTRFS_BLOCK_GROUP_RAID5) { | |
3738 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
3739 | btrfs_super_stripesize(info->super_copy)); | ||
3740 | data_stripes = num_stripes - 1; | ||
3741 | } | ||
3742 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
3743 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
3744 | btrfs_super_stripesize(info->super_copy)); | ||
3745 | data_stripes = num_stripes - 2; | ||
3746 | } | ||
3659 | do_div(stripe_size, dev_stripes); | 3747 | do_div(stripe_size, dev_stripes); |
3660 | 3748 | ||
3661 | /* align to BTRFS_STRIPE_LEN */ | 3749 | /* align to BTRFS_STRIPE_LEN */ |
3662 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3750 | do_div(stripe_size, raid_stripe_len); |
3663 | stripe_size *= BTRFS_STRIPE_LEN; | 3751 | stripe_size *= raid_stripe_len; |
3664 | 3752 | ||
3665 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3753 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
3666 | if (!map) { | 3754 | if (!map) { |
@@ -3678,14 +3766,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3678 | } | 3766 | } |
3679 | } | 3767 | } |
3680 | map->sector_size = extent_root->sectorsize; | 3768 | map->sector_size = extent_root->sectorsize; |
3681 | map->stripe_len = BTRFS_STRIPE_LEN; | 3769 | map->stripe_len = raid_stripe_len; |
3682 | map->io_align = BTRFS_STRIPE_LEN; | 3770 | map->io_align = raid_stripe_len; |
3683 | map->io_width = BTRFS_STRIPE_LEN; | 3771 | map->io_width = raid_stripe_len; |
3684 | map->type = type; | 3772 | map->type = type; |
3685 | map->sub_stripes = sub_stripes; | 3773 | map->sub_stripes = sub_stripes; |
3686 | 3774 | ||
3687 | *map_ret = map; | 3775 | *map_ret = map; |
3688 | num_bytes = stripe_size * (num_stripes / ncopies); | 3776 | num_bytes = stripe_size * data_stripes; |
3689 | 3777 | ||
3690 | *stripe_size_out = stripe_size; | 3778 | *stripe_size_out = stripe_size; |
3691 | *num_bytes_out = num_bytes; | 3779 | *num_bytes_out = num_bytes; |
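Worked numbers for the sizing above, as a standalone userspace sketch (not kernel code): six devices with 1 GiB available each, RAID6, dev_stripes = 1.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long stripe_size = 1ULL << 30;	/* per-device space */
		unsigned long long raid_stripe_len = 64 * 1024;	/* find_raid56_stripe_len() */
		int num_stripes = 6 * 1;			/* ndevs * dev_stripes */
		int data_stripes = num_stripes - 2;		/* RAID6 reserves P and Q */

		/* the do_div pair rounds stripe_size down to a stripe boundary */
		stripe_size -= stripe_size % raid_stripe_len;

		/* num_bytes = stripe_size * data_stripes */
		printf("chunk covers %llu bytes\n", stripe_size * data_stripes);
		return 0;	/* 4 GiB of logical space on 6 GiB of raw disk */
	}

This is why data_stripes replaces the old num_stripes / ncopies formula for the parity profiles: with ncopies == 3 for raid6, the old math would have undersized the block group (6 / 3 = 2 data stripes instead of 4).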
@@ -3734,6 +3822,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3734 | } | 3822 | } |
3735 | } | 3823 | } |
3736 | 3824 | ||
3825 | check_raid56_incompat_flag(extent_root->fs_info, type); | ||
3826 | |||
3737 | kfree(devices_info); | 3827 | kfree(devices_info); |
3738 | return 0; | 3828 | return 0; |
3739 | 3829 | ||
@@ -4003,6 +4093,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4003 | ret = map->num_stripes; | 4093 | ret = map->num_stripes; |
4004 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4094 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4005 | ret = map->sub_stripes; | 4095 | ret = map->sub_stripes; |
4096 | else if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
4097 | ret = 2; | ||
4098 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4099 | ret = 3; | ||
4006 | else | 4100 | else |
4007 | ret = 1; | 4101 | ret = 1; |
4008 | free_extent_map(em); | 4102 | free_extent_map(em); |
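For the parity profiles there are no literal mirrors; the counts say how many independent ways each block can be produced: the data stripe itself, a rebuild from the surviving data plus P, and (for RAID6) a rebuild via Q. A hedged sketch of how a read-retry loop can consume this, with try_read() standing in for whatever resubmission helper the caller uses (hypothetical name):

	int num_copies = btrfs_num_copies(fs_info, logical, len);
	int mirror;

	for (mirror = 1; mirror <= num_copies; mirror++) {
		/* mirror 1: plain read of the data stripe
		 * mirror 2: reconstruct from remaining data + P
		 * mirror 3: reconstruct from remaining data + Q (raid6) */
		if (try_read(logical, len, mirror) == 0)
			break;
	}

The mirror numbering matches the comment added in __btrfs_map_block() further down ("Mirror #2 is RAID5 parity block...").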
@@ -4015,6 +4109,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4015 | return ret; | 4109 | return ret; |
4016 | } | 4110 | } |
4017 | 4111 | ||
4112 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
4113 | struct btrfs_mapping_tree *map_tree, | ||
4114 | u64 logical) | ||
4115 | { | ||
4116 | struct extent_map *em; | ||
4117 | struct map_lookup *map; | ||
4118 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4119 | unsigned long len = root->sectorsize; | ||
4120 | |||
4121 | read_lock(&em_tree->lock); | ||
4122 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4123 | read_unlock(&em_tree->lock); | ||
4124 | BUG_ON(!em); | ||
4125 | |||
4126 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4127 | map = (struct map_lookup *)em->bdev; | ||
4128 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4129 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4130 | len = map->stripe_len * nr_data_stripes(map); | ||
4131 | } | ||
4132 | free_extent_map(em); | ||
4133 | return len; | ||
4134 | } | ||
4135 | |||
4136 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
4137 | u64 logical, u64 len, int mirror_num) | ||
4138 | { | ||
4139 | struct extent_map *em; | ||
4140 | struct map_lookup *map; | ||
4141 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4142 | int ret = 0; | ||
4143 | |||
4144 | read_lock(&em_tree->lock); | ||
4145 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4146 | read_unlock(&em_tree->lock); | ||
4147 | BUG_ON(!em); | ||
4148 | |||
4149 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4150 | map = (struct map_lookup *)em->bdev; | ||
4151 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4152 | BTRFS_BLOCK_GROUP_RAID6)) | ||
4153 | ret = 1; | ||
4154 | free_extent_map(em); | ||
4155 | return ret; | ||
4156 | } | ||
4157 | |||
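Both helpers exist so code outside volumes.c can reason about stripe geometry without digging through the extent map itself: btrfs_full_stripe_len() reports how much logical address space one full data stripe covers (just sectorsize for non-parity profiles), and btrfs_is_parity_mirror() tells a retry path that a given range lives on a parity profile, where an extra "mirror" is a reconstruction rather than a second physical copy. An illustrative alignment snippet (a sketch only, not taken from this patch):

	unsigned long flen = btrfs_full_stripe_len(root, map_tree, logical);
	u64 full_stripe_start = logical - (logical % flen);

	/* writing [full_stripe_start, full_stripe_start + flen) touches every
	 * data stripe once, so parity can be computed without reading back */

Note that flen is stripe_len * nr_data_stripes(map), e.g. 192 KiB for RAID5 on four disks, so it is generally not a power of two and cannot be aligned with the usual mask tricks.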
4018 | static int find_live_mirror(struct btrfs_fs_info *fs_info, | 4158 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
4019 | struct map_lookup *map, int first, int num, | 4159 | struct map_lookup *map, int first, int num, |
4020 | int optimal, int dev_replace_is_ongoing) | 4160 | int optimal, int dev_replace_is_ongoing) |
@@ -4052,10 +4192,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
4052 | return optimal; | 4192 | return optimal; |
4053 | } | 4193 | } |
4054 | 4194 | ||
4195 | static inline int parity_smaller(u64 a, u64 b) | ||
4196 | { | ||
4197 | return a > b; | ||
4198 | } | ||
4199 | |||
4200 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ | ||
4201 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | ||
4202 | { | ||
4203 | struct btrfs_bio_stripe s; | ||
4204 | int i; | ||
4205 | u64 l; | ||
4206 | int again = 1; | ||
4207 | |||
4208 | while (again) { | ||
4209 | again = 0; | ||
4210 | for (i = 0; i < bbio->num_stripes - 1; i++) { | ||
4211 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | ||
4212 | s = bbio->stripes[i]; | ||
4213 | l = raid_map[i]; | ||
4214 | bbio->stripes[i] = bbio->stripes[i+1]; | ||
4215 | raid_map[i] = raid_map[i+1]; | ||
4216 | bbio->stripes[i+1] = s; | ||
4217 | raid_map[i+1] = l; | ||
4218 | again = 1; | ||
4219 | } | ||
4220 | } | ||
4221 | } | ||
4222 | } | ||
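parity_smaller() is just a greater-than test, so this bubble pass sorts raid_map ascending and drags bbio->stripes along in lockstep. The trick is in the sentinel values: assuming the definitions this series adds in raid56.h (RAID5_P_STRIPE = (u64)-2, RAID6_Q_STRIPE = (u64)-1), P and Q compare larger than any real logical address, so after sorting the data stripes sit first in logical order with P and then Q at the tail. A standalone demonstration:

	#include <stdio.h>

	typedef unsigned long long u64;
	#define RAID5_P_STRIPE ((u64)-2)	/* assumed from raid56.h */
	#define RAID6_Q_STRIPE ((u64)-1)

	static int parity_smaller(u64 a, u64 b) { return a > b; }

	int main(void)
	{
		/* raid_map as built below for rot == 2: P, Q, then data */
		u64 raid_map[5] = { RAID5_P_STRIPE, RAID6_Q_STRIPE,
				    1376256, 1441792, 1507328 };
		int i, again = 1;

		while (again) {
			again = 0;
			for (i = 0; i < 4; i++) {
				if (parity_smaller(raid_map[i], raid_map[i+1])) {
					u64 t = raid_map[i];	/* the real code swaps
								 * the stripe too */
					raid_map[i] = raid_map[i+1];
					raid_map[i+1] = t;
					again = 1;
				}
			}
		}
		for (i = 0; i < 5; i++)
			printf("raid_map[%d] = %llu\n", i, raid_map[i]);
		return 0;	/* 1376256, 1441792, 1507328, then P, then Q */
	}

With at most two parity entries out of place, the quadratic cost of the bubble sort is irrelevant here.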
4223 | |||
4055 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 4224 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
4056 | u64 logical, u64 *length, | 4225 | u64 logical, u64 *length, |
4057 | struct btrfs_bio **bbio_ret, | 4226 | struct btrfs_bio **bbio_ret, |
4058 | int mirror_num) | 4227 | int mirror_num, u64 **raid_map_ret) |
4059 | { | 4228 | { |
4060 | struct extent_map *em; | 4229 | struct extent_map *em; |
4061 | struct map_lookup *map; | 4230 | struct map_lookup *map; |
@@ -4067,6 +4236,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4067 | u64 stripe_nr; | 4236 | u64 stripe_nr; |
4068 | u64 stripe_nr_orig; | 4237 | u64 stripe_nr_orig; |
4069 | u64 stripe_nr_end; | 4238 | u64 stripe_nr_end; |
4239 | u64 stripe_len; | ||
4240 | u64 *raid_map = NULL; | ||
4070 | int stripe_index; | 4241 | int stripe_index; |
4071 | int i; | 4242 | int i; |
4072 | int ret = 0; | 4243 | int ret = 0; |
@@ -4078,6 +4249,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4078 | int num_alloc_stripes; | 4249 | int num_alloc_stripes; |
4079 | int patch_the_first_stripe_for_dev_replace = 0; | 4250 | int patch_the_first_stripe_for_dev_replace = 0; |
4080 | u64 physical_to_patch_in_first_stripe = 0; | 4251 | u64 physical_to_patch_in_first_stripe = 0; |
4252 | u64 raid56_full_stripe_start = (u64)-1; | ||
4081 | 4253 | ||
4082 | read_lock(&em_tree->lock); | 4254 | read_lock(&em_tree->lock); |
4083 | em = lookup_extent_mapping(em_tree, logical, *length); | 4255 | em = lookup_extent_mapping(em_tree, logical, *length); |
@@ -4094,29 +4266,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4094 | map = (struct map_lookup *)em->bdev; | 4266 | map = (struct map_lookup *)em->bdev; |
4095 | offset = logical - em->start; | 4267 | offset = logical - em->start; |
4096 | 4268 | ||
4269 | if (mirror_num > map->num_stripes) | ||
4270 | mirror_num = 0; | ||
4271 | |||
4272 | stripe_len = map->stripe_len; | ||
4097 | stripe_nr = offset; | 4273 | stripe_nr = offset; |
4098 | /* | 4274 | /* |
4099 | * stripe_nr counts the total number of stripes we have to stride | 4275 | * stripe_nr counts the total number of stripes we have to stride |
4100 | * to get to this block | 4276 | * to get to this block |
4101 | */ | 4277 | */ |
4102 | do_div(stripe_nr, map->stripe_len); | 4278 | do_div(stripe_nr, stripe_len); |
4103 | 4279 | ||
4104 | stripe_offset = stripe_nr * map->stripe_len; | 4280 | stripe_offset = stripe_nr * stripe_len; |
4105 | BUG_ON(offset < stripe_offset); | 4281 | BUG_ON(offset < stripe_offset); |
4106 | 4282 | ||
4107 | /* stripe_offset is the offset of this block in its stripe*/ | 4283 | /* stripe_offset is the offset of this block in its stripe*/ |
4108 | stripe_offset = offset - stripe_offset; | 4284 | stripe_offset = offset - stripe_offset; |
4109 | 4285 | ||
4110 | if (rw & REQ_DISCARD) | 4286 | /* if we're here for raid56, we need to know the stripe aligned start */ |
4287 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4288 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
4289 | raid56_full_stripe_start = offset; | ||
4290 | |||
4291 | /* allow a write of a full stripe, but make sure we don't | ||
4292 | * allow straddling of stripes | ||
4293 | */ | ||
4294 | do_div(raid56_full_stripe_start, full_stripe_len); | ||
4295 | raid56_full_stripe_start *= full_stripe_len; | ||
4296 | } | ||
4297 | |||
4298 | if (rw & REQ_DISCARD) { | ||
4299 | /* we don't discard raid56 yet */ | ||
4300 | if (map->type & | ||
4301 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4302 | ret = -EOPNOTSUPP; | ||
4303 | goto out; | ||
4304 | } | ||
4111 | *length = min_t(u64, em->len - offset, *length); | 4305 | *length = min_t(u64, em->len - offset, *length); |
4112 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | 4306 | } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
4113 | /* we limit the length of each bio to what fits in a stripe */ | 4307 | u64 max_len; |
4114 | *length = min_t(u64, em->len - offset, | 4308 | /* For writes to RAID[56], allow a full stripeset across all disks. |
4115 | map->stripe_len - stripe_offset); | 4309 | For other RAID types and for RAID[56] reads, just allow a single |
4310 | stripe (on a single disk). */ | ||
4311 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && | ||
4312 | (rw & REQ_WRITE)) { | ||
4313 | max_len = stripe_len * nr_data_stripes(map) - | ||
4314 | (offset - raid56_full_stripe_start); | ||
4315 | } else { | ||
4316 | /* we limit the length of each bio to what fits in a stripe */ | ||
4317 | max_len = stripe_len - stripe_offset; | ||
4318 | } | ||
4319 | *length = min_t(u64, em->len - offset, max_len); | ||
4116 | } else { | 4320 | } else { |
4117 | *length = em->len - offset; | 4321 | *length = em->len - offset; |
4118 | } | 4322 | } |
4119 | 4323 | ||
4324 | /* This is for when we're called from btrfs_merge_bio_hook() and all | ||
4325 | it cares about is the length */ | ||
4120 | if (!bbio_ret) | 4326 | if (!bbio_ret) |
4121 | goto out; | 4327 | goto out; |
4122 | 4328 | ||
@@ -4149,7 +4355,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4149 | u64 physical_of_found = 0; | 4355 | u64 physical_of_found = 0; |
4150 | 4356 | ||
4151 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | 4357 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, |
4152 | logical, &tmp_length, &tmp_bbio, 0); | 4358 | logical, &tmp_length, &tmp_bbio, 0, NULL); |
4153 | if (ret) { | 4359 | if (ret) { |
4154 | WARN_ON(tmp_bbio != NULL); | 4360 | WARN_ON(tmp_bbio != NULL); |
4155 | goto out; | 4361 | goto out; |
@@ -4215,6 +4421,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4215 | do_div(stripe_nr_end, map->stripe_len); | 4421 | do_div(stripe_nr_end, map->stripe_len); |
4216 | stripe_end_offset = stripe_nr_end * map->stripe_len - | 4422 | stripe_end_offset = stripe_nr_end * map->stripe_len - |
4217 | (offset + *length); | 4423 | (offset + *length); |
4424 | |||
4218 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4425 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4219 | if (rw & REQ_DISCARD) | 4426 | if (rw & REQ_DISCARD) |
4220 | num_stripes = min_t(u64, map->num_stripes, | 4427 | num_stripes = min_t(u64, map->num_stripes, |
@@ -4265,6 +4472,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4265 | dev_replace_is_ongoing); | 4472 | dev_replace_is_ongoing); |
4266 | mirror_num = stripe_index - old_stripe_index + 1; | 4473 | mirror_num = stripe_index - old_stripe_index + 1; |
4267 | } | 4474 | } |
4475 | |||
4476 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4477 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4478 | u64 tmp; | ||
4479 | |||
4480 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | ||
4481 | && raid_map_ret) { | ||
4482 | int i, rot; | ||
4483 | |||
4484 | /* push stripe_nr back to the start of the full stripe */ | ||
4485 | stripe_nr = raid56_full_stripe_start; | ||
4486 | do_div(stripe_nr, stripe_len); | ||
4487 | |||
4488 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4489 | |||
4490 | /* RAID[56] write or recovery. Return all stripes */ | ||
4491 | num_stripes = map->num_stripes; | ||
4492 | max_errors = nr_parity_stripes(map); | ||
4493 | |||
4494 | raid_map = kmalloc(sizeof(u64) * num_stripes, | ||
4495 | GFP_NOFS); | ||
4496 | if (!raid_map) { | ||
4497 | ret = -ENOMEM; | ||
4498 | goto out; | ||
4499 | } | ||
4500 | |||
4501 | /* Work out the disk rotation on this stripe-set */ | ||
4502 | tmp = stripe_nr; | ||
4503 | rot = do_div(tmp, num_stripes); | ||
4504 | |||
4505 | /* Fill in the logical address of each stripe */ | ||
4506 | tmp = stripe_nr * nr_data_stripes(map); | ||
4507 | for (i = 0; i < nr_data_stripes(map); i++) | ||
4508 | raid_map[(i+rot) % num_stripes] = | ||
4509 | em->start + (tmp + i) * map->stripe_len; | ||
4510 | |||
4511 | raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; | ||
4512 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4513 | raid_map[(i+rot+1) % num_stripes] = | ||
4514 | RAID6_Q_STRIPE; | ||
4515 | |||
4516 | *length = map->stripe_len; | ||
4517 | stripe_index = 0; | ||
4518 | stripe_offset = 0; | ||
4519 | } else { | ||
4520 | /* | ||
4521 | * Mirror #0 or #1 means the original data block. | ||
4522 | * Mirror #2 is RAID5 parity block. | ||
4523 | * Mirror #3 is RAID6 Q block. | ||
4524 | */ | ||
4525 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4526 | if (mirror_num > 1) | ||
4527 | stripe_index = nr_data_stripes(map) + | ||
4528 | mirror_num - 2; | ||
4529 | |||
4530 | /* We distribute the parity blocks across stripes */ | ||
4531 | tmp = stripe_nr + stripe_index; | ||
4532 | stripe_index = do_div(tmp, map->num_stripes); | ||
4533 | } | ||
4268 | } else { | 4534 | } else { |
4269 | /* | 4535 | /* |
4270 | * after this do_div call, stripe_nr is the number of stripes | 4536 | * after this do_div call, stripe_nr is the number of stripes |
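A concrete pass through the rotation math above, for RAID6 on five devices (three data stripes plus P and Q), stripe_len = 64 KiB, and a logical address inside full stripe number 7. Sentinel values are assumed from this series' raid56.h:

	#include <stdio.h>

	typedef unsigned long long u64;
	#define RAID5_P_STRIPE ((u64)-2)	/* assumed from raid56.h */
	#define RAID6_Q_STRIPE ((u64)-1)

	int main(void)
	{
		u64 raid_map[5];
		u64 em_start = 0, stripe_len = 65536, stripe_nr = 7;
		int num_stripes = 5, nr_data = 3;	/* 3 data + P + Q */
		int i, rot;
		u64 tmp;

		rot = stripe_nr % num_stripes;		/* 7 % 5 == 2 */
		tmp = stripe_nr * nr_data;		/* first data stripe: 21 */
		for (i = 0; i < nr_data; i++)
			raid_map[(i + rot) % num_stripes] =
				em_start + (tmp + i) * stripe_len;
		raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;	/* slot 0 */
		raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE;	/* slot 1 */

		for (i = 0; i < num_stripes; i++)
			printf("stripe %d -> %llu\n", i, raid_map[i]);
		/* device order holds P, Q, D21, D22, D23: parity rotates one
		 * slot per full stripe, like a left-symmetric RAID layout */
		return 0;
	}

Note the construction in the kernel code relies on the for loop leaving i == nr_data_stripes(map) when it falls through, which is why the P assignment reuses (i+rot) without resetting i. sort_parity_stripes() then restores logical order for the caller, with P and Q moved to the end.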
@@ -4373,8 +4639,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4373 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { | 4639 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
4374 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4640 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
4375 | BTRFS_BLOCK_GROUP_RAID10 | | 4641 | BTRFS_BLOCK_GROUP_RAID10 | |
4642 | BTRFS_BLOCK_GROUP_RAID5 | | ||
4376 | BTRFS_BLOCK_GROUP_DUP)) { | 4643 | BTRFS_BLOCK_GROUP_DUP)) { |
4377 | max_errors = 1; | 4644 | max_errors = 1; |
4645 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
4646 | max_errors = 2; | ||
4378 | } | 4647 | } |
4379 | } | 4648 | } |
4380 | 4649 | ||
@@ -4475,6 +4744,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4475 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | 4744 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; |
4476 | bbio->mirror_num = map->num_stripes + 1; | 4745 | bbio->mirror_num = map->num_stripes + 1; |
4477 | } | 4746 | } |
4747 | if (raid_map) { | ||
4748 | sort_parity_stripes(bbio, raid_map); | ||
4749 | *raid_map_ret = raid_map; | ||
4750 | } | ||
4478 | out: | 4751 | out: |
4479 | if (dev_replace_is_ongoing) | 4752 | if (dev_replace_is_ongoing) |
4480 | btrfs_dev_replace_unlock(dev_replace); | 4753 | btrfs_dev_replace_unlock(dev_replace); |
@@ -4487,7 +4760,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4487 | struct btrfs_bio **bbio_ret, int mirror_num) | 4760 | struct btrfs_bio **bbio_ret, int mirror_num) |
4488 | { | 4761 | { |
4489 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | 4762 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
4490 | mirror_num); | 4763 | mirror_num, NULL); |
4491 | } | 4764 | } |
4492 | 4765 | ||
4493 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 4766 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
@@ -4501,6 +4774,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4501 | u64 bytenr; | 4774 | u64 bytenr; |
4502 | u64 length; | 4775 | u64 length; |
4503 | u64 stripe_nr; | 4776 | u64 stripe_nr; |
4777 | u64 rmap_len; | ||
4504 | int i, j, nr = 0; | 4778 | int i, j, nr = 0; |
4505 | 4779 | ||
4506 | read_lock(&em_tree->lock); | 4780 | read_lock(&em_tree->lock); |
@@ -4511,10 +4785,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4511 | map = (struct map_lookup *)em->bdev; | 4785 | map = (struct map_lookup *)em->bdev; |
4512 | 4786 | ||
4513 | length = em->len; | 4787 | length = em->len; |
4788 | rmap_len = map->stripe_len; | ||
4789 | |||
4514 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4790 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4515 | do_div(length, map->num_stripes / map->sub_stripes); | 4791 | do_div(length, map->num_stripes / map->sub_stripes); |
4516 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | 4792 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
4517 | do_div(length, map->num_stripes); | 4793 | do_div(length, map->num_stripes); |
4794 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4795 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4796 | do_div(length, nr_data_stripes(map)); | ||
4797 | rmap_len = map->stripe_len * nr_data_stripes(map); | ||
4798 | } | ||
4518 | 4799 | ||
4519 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | 4800 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); |
4520 | BUG_ON(!buf); /* -ENOMEM */ | 4801 | BUG_ON(!buf); /* -ENOMEM */ |
@@ -4534,8 +4815,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4534 | do_div(stripe_nr, map->sub_stripes); | 4815 | do_div(stripe_nr, map->sub_stripes); |
4535 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4816 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4536 | stripe_nr = stripe_nr * map->num_stripes + i; | 4817 | stripe_nr = stripe_nr * map->num_stripes + i; |
4537 | } | 4818 | } /* else if RAID[56], multiply by nr_data_stripes(). |
4538 | bytenr = chunk_start + stripe_nr * map->stripe_len; | 4819 | * Alternatively, just use rmap_len below instead of |
4820 | * map->stripe_len */ | ||
4821 | |||
4822 | bytenr = chunk_start + stripe_nr * rmap_len; | ||
4539 | WARN_ON(nr >= map->num_stripes); | 4823 | WARN_ON(nr >= map->num_stripes); |
4540 | for (j = 0; j < nr; j++) { | 4824 | for (j = 0; j < nr; j++) { |
4541 | if (buf[j] == bytenr) | 4825 | if (buf[j] == bytenr) |
@@ -4549,7 +4833,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4549 | 4833 | ||
4550 | *logical = buf; | 4834 | *logical = buf; |
4551 | *naddrs = nr; | 4835 | *naddrs = nr; |
4552 | *stripe_len = map->stripe_len; | 4836 | *stripe_len = rmap_len; |
4553 | 4837 | ||
4554 | free_extent_map(em); | 4838 | free_extent_map(em); |
4555 | return 0; | 4839 | return 0; |
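For the parity profiles the reverse mapping must account for one device-local stripe backing nr_data_stripes() worth of logical space. Illustrative arithmetic, following the code above:

	/*
	 * RAID5 over four devices: nr_data_stripes(map) == 3.
	 *   length   = em->len / 3;           bytes contributed per device
	 *   rmap_len = map->stripe_len * 3;   logical span of one device stripe
	 * A 64 KiB physical stripe on one disk therefore reverse-maps to
	 * 192 KiB of logical address space, which is why *stripe_len reported
	 * to the caller becomes rmap_len rather than map->stripe_len.
	 */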
@@ -4623,7 +4907,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
4623 | bio->bi_bdev = (struct block_device *) | 4907 | bio->bi_bdev = (struct block_device *) |
4624 | (unsigned long)bbio->mirror_num; | 4908 | (unsigned long)bbio->mirror_num; |
4625 | /* only send an error to the higher layers if it is | 4909 | /* only send an error to the higher layers if it is |
4626 | * beyond the tolerance of the multi-bio | 4910 | * beyond the tolerance of the btrfs bio |
4627 | */ | 4911 | */ |
4628 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 4912 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
4629 | err = -EIO; | 4913 | err = -EIO; |
@@ -4657,13 +4941,18 @@ struct async_sched { | |||
4657 | * This will add one bio to the pending list for a device and make sure | 4941 | * This will add one bio to the pending list for a device and make sure |
4658 | * the work struct is scheduled. | 4942 | * the work struct is scheduled. |
4659 | */ | 4943 | */ |
4660 | static noinline void schedule_bio(struct btrfs_root *root, | 4944 | noinline void btrfs_schedule_bio(struct btrfs_root *root, |
4661 | struct btrfs_device *device, | 4945 | struct btrfs_device *device, |
4662 | int rw, struct bio *bio) | 4946 | int rw, struct bio *bio) |
4663 | { | 4947 | { |
4664 | int should_queue = 1; | 4948 | int should_queue = 1; |
4665 | struct btrfs_pending_bios *pending_bios; | 4949 | struct btrfs_pending_bios *pending_bios; |
4666 | 4950 | ||
4951 | if (device->missing || !device->bdev) { | ||
4952 | bio_endio(bio, -EIO); | ||
4953 | return; | ||
4954 | } | ||
4955 | |||
4667 | /* don't bother with additional async steps for reads, right now */ | 4956 | /* don't bother with additional async steps for reads, right now */ |
4668 | if (!(rw & REQ_WRITE)) { | 4957 | if (!(rw & REQ_WRITE)) { |
4669 | bio_get(bio); | 4958 | bio_get(bio); |
@@ -4761,7 +5050,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | |||
4761 | #endif | 5050 | #endif |
4762 | bio->bi_bdev = dev->bdev; | 5051 | bio->bi_bdev = dev->bdev; |
4763 | if (async) | 5052 | if (async) |
4764 | schedule_bio(root, dev, rw, bio); | 5053 | btrfs_schedule_bio(root, dev, rw, bio); |
4765 | else | 5054 | else |
4766 | btrfsic_submit_bio(rw, bio); | 5055 | btrfsic_submit_bio(rw, bio); |
4767 | } | 5056 | } |
@@ -4820,6 +5109,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4820 | u64 logical = (u64)bio->bi_sector << 9; | 5109 | u64 logical = (u64)bio->bi_sector << 9; |
4821 | u64 length = 0; | 5110 | u64 length = 0; |
4822 | u64 map_length; | 5111 | u64 map_length; |
5112 | u64 *raid_map = NULL; | ||
4823 | int ret; | 5113 | int ret; |
4824 | int dev_nr = 0; | 5114 | int dev_nr = 0; |
4825 | int total_devs = 1; | 5115 | int total_devs = 1; |
@@ -4828,12 +5118,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4828 | length = bio->bi_size; | 5118 | length = bio->bi_size; |
4829 | map_length = length; | 5119 | map_length = length; |
4830 | 5120 | ||
4831 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, | 5121 | ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
4832 | mirror_num); | 5122 | mirror_num, &raid_map); |
4833 | if (ret) | 5123 | if (ret) /* -ENOMEM */ |
4834 | return ret; | 5124 | return ret; |
4835 | 5125 | ||
4836 | total_devs = bbio->num_stripes; | 5126 | total_devs = bbio->num_stripes; |
5127 | bbio->orig_bio = first_bio; | ||
5128 | bbio->private = first_bio->bi_private; | ||
5129 | bbio->end_io = first_bio->bi_end_io; | ||
5130 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
5131 | |||
5132 | if (raid_map) { | ||
5133 | /* In this case, map_length has been set to the length of | ||
5134 | a single stripe; not the whole write */ | ||
5135 | if (rw & WRITE) { | ||
5136 | return raid56_parity_write(root, bio, bbio, | ||
5137 | raid_map, map_length); | ||
5138 | } else { | ||
5139 | return raid56_parity_recover(root, bio, bbio, | ||
5140 | raid_map, map_length, | ||
5141 | mirror_num); | ||
5142 | } | ||
5143 | } | ||
5144 | |||
4837 | if (map_length < length) { | 5145 | if (map_length < length) { |
4838 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " | 5146 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " |
4839 | "len %llu\n", (unsigned long long)logical, | 5147 | "len %llu\n", (unsigned long long)logical, |
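The parity profiles branch off before the per-stripe submission loop because a RAID5/6 write is not N independent device bios: parity must be computed across the whole stripe set. Both entry points come from raid56.c in this series; their declarations, as they appear in raid56.h (lightly reformatted):

	int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
				struct btrfs_bio *bbio, u64 *raid_map,
				u64 stripe_len);
	int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
				  struct btrfs_bio *bbio, u64 *raid_map,
				  u64 stripe_len, int mirror_num);

They take ownership of bbio and raid_map, freeing both on completion, and complete first_bio through the end_io fields stashed just above; that is why the orig_bio/private/end_io assignments had to move up ahead of this early return (they are deleted from their old location in the next hunk).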
@@ -4842,11 +5150,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4842 | BUG(); | 5150 | BUG(); |
4843 | } | 5151 | } |
4844 | 5152 | ||
4845 | bbio->orig_bio = first_bio; | ||
4846 | bbio->private = first_bio->bi_private; | ||
4847 | bbio->end_io = first_bio->bi_end_io; | ||
4848 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
4849 | |||
4850 | while (dev_nr < total_devs) { | 5153 | while (dev_nr < total_devs) { |
4851 | dev = bbio->stripes[dev_nr].dev; | 5154 | dev = bbio->stripes[dev_nr].dev; |
4852 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | 5155 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac751..0c2b856ecd98 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, |
322 | struct btrfs_device *tgtdev); | 322 | struct btrfs_device *tgtdev); |
323 | int btrfs_scratch_superblock(struct btrfs_device *device); | 323 | int btrfs_scratch_superblock(struct btrfs_device *device); |
324 | 324 | void btrfs_schedule_bio(struct btrfs_root *root, | |
325 | struct btrfs_device *device, | ||
326 | int rw, struct bio *bio); | ||
327 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
328 | u64 logical, u64 len, int mirror_num); | ||
329 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
330 | struct btrfs_mapping_tree *map_tree, | ||
331 | u64 logical); | ||
325 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, | 332 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, |
326 | int index) | 333 | int index) |
327 | { | 334 | { |