authorDavid Woodhouse <David.Woodhouse@intel.com>2013-01-29 18:40:14 -0500
committerChris Mason <chris.mason@fusionio.com>2013-02-01 14:24:23 -0500
commit 53b381b3abeb86f12787a6c40fee9b2f71edc23b (patch)
tree c1018ba2157778f0200d2ede0c0df48fe5df8f14 /fs/btrfs
parent 64a167011bcabc1e855658387c8a4464b71f3138 (diff)
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit; blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe.

But it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.
Discard does not work on raid5/6 (yet).

The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
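To make the geometry concrete, here is a minimal sketch of how the fixed 64KiB per-disk stripe mentioned above translates into full-stripe sizes. The macro and helper name are illustrative, not part of this patch; the nr_data arithmetic mirrors alloc_rbio() in the new raid56.c below.

/* Illustrative only: RAID5 reserves one stripe for P, RAID6 two (P and Q),
 * so the data payload of one full stripe is nr_data * 64KiB.
 */
#define RBIO_STRIPE_LEN	(64 * 1024)

static unsigned long full_stripe_data_bytes(int num_stripes, int is_raid6)
{
	int nr_data = num_stripes - (is_raid6 ? 2 : 1);

	return nr_data * RBIO_STRIPE_LEN;	/* e.g. 4-disk RAID5: 192KiB */
}

A full stripe is the unit the read/modify/write code locks and completes as a whole.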
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig             |    2
-rw-r--r--  fs/btrfs/Makefile            |    2
-rw-r--r--  fs/btrfs/ctree.h             |   35
-rw-r--r--  fs/btrfs/disk-io.c           |   62
-rw-r--r--  fs/btrfs/disk-io.h           |    7
-rw-r--r--  fs/btrfs/extent-tree.c       |   88
-rw-r--r--  fs/btrfs/extent_io.c         |   18
-rw-r--r--  fs/btrfs/free-space-cache.c  |   50
-rw-r--r--  fs/btrfs/inode.c             |   18
-rw-r--r--  fs/btrfs/raid56.c            | 1647
-rw-r--r--  fs/btrfs/raid56.h            |   51
-rw-r--r--  fs/btrfs/scrub.c             |    8
-rw-r--r--  fs/btrfs/transaction.c       |    3
-rw-r--r--  fs/btrfs/volumes.c           |  385
-rw-r--r--  fs/btrfs/volumes.h           |    9
15 files changed, 2283 insertions, 102 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..4f5dc93fa2f8 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,8 @@ config BTRFS_FS
6 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
7 select LZO_COMPRESS 7 select LZO_COMPRESS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ
10
9 help 11 help
10 Btrfs is a new filesystem with extents, writable snapshotting, 12 Btrfs is a new filesystem with extents, writable snapshotting,
11 support for multiple devices and many more features. 13 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ab51be6879f..0cce3aafbd62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -502,6 +502,7 @@ struct btrfs_super_block {
502#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 502#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
503 503
504#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 504#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
505#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
505 506
506#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 507#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
507#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 508#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -511,6 +512,7 @@ struct btrfs_super_block {
511 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 512 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
512 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 513 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
513 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 514 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
515 BTRFS_FEATURE_INCOMPAT_RAID56 | \
514 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) 516 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
515 517
516/* 518/*
@@ -952,8 +954,10 @@ struct btrfs_dev_replace_item {
952#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 954#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
953#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 955#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
954#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 956#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
957#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
958#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
955#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 959#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
956#define BTRFS_NR_RAID_TYPES 5 960#define BTRFS_NR_RAID_TYPES 7
957 961
958#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 962#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
959 BTRFS_BLOCK_GROUP_SYSTEM | \ 963 BTRFS_BLOCK_GROUP_SYSTEM | \
@@ -961,6 +965,8 @@ struct btrfs_dev_replace_item {
961 965
962#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 966#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
963 BTRFS_BLOCK_GROUP_RAID1 | \ 967 BTRFS_BLOCK_GROUP_RAID1 | \
968 BTRFS_BLOCK_GROUP_RAID5 | \
969 BTRFS_BLOCK_GROUP_RAID6 | \
964 BTRFS_BLOCK_GROUP_DUP | \ 970 BTRFS_BLOCK_GROUP_DUP | \
965 BTRFS_BLOCK_GROUP_RAID10) 971 BTRFS_BLOCK_GROUP_RAID10)
966/* 972/*
@@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
1185 u64 flags; 1191 u64 flags;
1186 u64 sectorsize; 1192 u64 sectorsize;
1187 u64 cache_generation; 1193 u64 cache_generation;
1194
1195 /* for raid56, this is a full stripe, without parity */
1196 unsigned long full_stripe_len;
1197
1188 unsigned int ro:1; 1198 unsigned int ro:1;
1189 unsigned int dirty:1; 1199 unsigned int dirty:1;
1190 unsigned int iref:1; 1200 unsigned int iref:1;
@@ -1225,6 +1235,20 @@ struct seq_list {
1225 u64 seq; 1235 u64 seq;
1226}; 1236};
1227 1237
1238/* used by the raid56 code to lock stripes for read/modify/write */
1239struct btrfs_stripe_hash {
1240 struct list_head hash_list;
1241 wait_queue_head_t wait;
1242 spinlock_t lock;
1243};
1244
1245/* used by the raid56 code to lock stripes for read/modify/write */
1246struct btrfs_stripe_hash_table {
1247 struct btrfs_stripe_hash *table;
1248};
1249
1250#define BTRFS_STRIPE_HASH_TABLE_BITS 11
1251
1228/* fs_info */ 1252/* fs_info */
1229struct reloc_control; 1253struct reloc_control;
1230struct btrfs_device; 1254struct btrfs_device;
@@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
1307 struct mutex cleaner_mutex; 1331 struct mutex cleaner_mutex;
1308 struct mutex chunk_mutex; 1332 struct mutex chunk_mutex;
1309 struct mutex volume_mutex; 1333 struct mutex volume_mutex;
1334
1335 /* this is used during read/modify/write to make sure
1336 * no two ios are trying to mod the same stripe at the same
1337 * time
1338 */
1339 struct btrfs_stripe_hash_table *stripe_hash_table;
1340
1310 /* 1341 /*
1311 * this protects the ordered operations list only while we are 1342 * this protects the ordered operations list only while we are
1312 * processing all of the entries on it. This way we make 1343 * processing all of the entries on it. This way we make
@@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
1395 struct btrfs_workers flush_workers; 1426 struct btrfs_workers flush_workers;
1396 struct btrfs_workers endio_workers; 1427 struct btrfs_workers endio_workers;
1397 struct btrfs_workers endio_meta_workers; 1428 struct btrfs_workers endio_meta_workers;
1429 struct btrfs_workers endio_raid56_workers;
1430 struct btrfs_workers rmw_workers;
1398 struct btrfs_workers endio_meta_write_workers; 1431 struct btrfs_workers endio_meta_write_workers;
1399 struct btrfs_workers endio_write_workers; 1432 struct btrfs_workers endio_write_workers;
1400 struct btrfs_workers endio_freespace_worker; 1433 struct btrfs_workers endio_freespace_worker;
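The stripe hash declarations added to ctree.h above size the table at 1 << BTRFS_STRIPE_HASH_TABLE_BITS = 2048 buckets. A hedged sketch of how a full stripe's starting logical address picks a bucket; it mirrors rbio_bucket() in raid56.c further down, and the standalone function name here is illustrative:

#include <linux/hash.h>

/* Illustrative only: hash on the first logical address of the full stripe.
 * The low 16 bits are shifted off because they are mostly zero and would
 * otherwise make hash_64() return only a handful of distinct buckets.
 */
static int stripe_bucket(u64 first_logical)
{
	return hash_64(first_logical >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}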
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 65f03670a952..e9fa7b4d18e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ err:
639 btree_readahead_hook(root, eb, eb->start, ret); 640 btree_readahead_hook(root, eb, eb->start, ret);
640 } 641 }
641 642
642 if (ret) 643 if (ret) {
644 /*
645 * our io error hook is going to dec the io pages
646 * again, we have to make sure it has something
647 * to decrement
648 */
649 atomic_inc(&eb->io_pages);
643 clear_extent_buffer_uptodate(eb); 650 clear_extent_buffer_uptodate(eb);
651 }
644 free_extent_buffer(eb); 652 free_extent_buffer(eb);
645out: 653out:
646 return ret; 654 return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654 eb = (struct extent_buffer *)page->private; 662 eb = (struct extent_buffer *)page->private;
655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 663 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656 eb->read_mirror = failed_mirror; 664 eb->read_mirror = failed_mirror;
665 atomic_dec(&eb->io_pages);
657 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 666 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658 btree_readahead_hook(root, eb, eb->start, -EIO); 667 btree_readahead_hook(root, eb, eb->start, -EIO);
659 return -EIO; /* we fixed nothing */ 668 return -EIO; /* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670 end_io_wq->work.flags = 0; 679 end_io_wq->work.flags = 0;
671 680
672 if (bio->bi_rw & REQ_WRITE) { 681 if (bio->bi_rw & REQ_WRITE) {
673 if (end_io_wq->metadata == 1) 682 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 683 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675 &end_io_wq->work); 684 &end_io_wq->work);
676 else if (end_io_wq->metadata == 2) 685 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 686 btrfs_queue_worker(&fs_info->endio_freespace_worker,
678 &end_io_wq->work); 687 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
689 btrfs_queue_worker(&fs_info->endio_raid56_workers,
690 &end_io_wq->work);
679 else 691 else
680 btrfs_queue_worker(&fs_info->endio_write_workers, 692 btrfs_queue_worker(&fs_info->endio_write_workers,
681 &end_io_wq->work); 693 &end_io_wq->work);
682 } else { 694 } else {
683 if (end_io_wq->metadata) 695 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
696 btrfs_queue_worker(&fs_info->endio_raid56_workers,
697 &end_io_wq->work);
698 else if (end_io_wq->metadata)
684 btrfs_queue_worker(&fs_info->endio_meta_workers, 699 btrfs_queue_worker(&fs_info->endio_meta_workers,
685 &end_io_wq->work); 700 &end_io_wq->work);
686 else 701 else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695 * 0 - if data 710 * 0 - if data
696 * 1 - if normal metadata 711 * 1 - if normal metadata
697 * 2 - if writing to the free space cache area 712 * 2 - if writing to the free space cache area
713 * 3 - raid parity work
698 */ 714 */
699int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 715int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700 int metadata) 716 int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2181 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2166 init_waitqueue_head(&fs_info->async_submit_wait); 2182 init_waitqueue_head(&fs_info->async_submit_wait);
2167 2183
2184 ret = btrfs_alloc_stripe_hash_table(fs_info);
2185 if (ret) {
2186 err = -ENOMEM;
2187 goto fail_alloc;
2188 }
2189
2168 __setup_root(4096, 4096, 4096, 4096, tree_root, 2190 __setup_root(4096, 4096, 4096, 4096, tree_root,
2169 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2191 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2170 2192
@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2354 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2333 "endio-meta-write", fs_info->thread_pool_size, 2355 "endio-meta-write", fs_info->thread_pool_size,
2334 &fs_info->generic_worker); 2356 &fs_info->generic_worker);
2357 btrfs_init_workers(&fs_info->endio_raid56_workers,
2358 "endio-raid56", fs_info->thread_pool_size,
2359 &fs_info->generic_worker);
2360 btrfs_init_workers(&fs_info->rmw_workers,
2361 "rmw", fs_info->thread_pool_size,
2362 &fs_info->generic_worker);
2335 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2363 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2336 fs_info->thread_pool_size, 2364 fs_info->thread_pool_size,
2337 &fs_info->generic_worker); 2365 &fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
2350 */ 2378 */
2351 fs_info->endio_workers.idle_thresh = 4; 2379 fs_info->endio_workers.idle_thresh = 4;
2352 fs_info->endio_meta_workers.idle_thresh = 4; 2380 fs_info->endio_meta_workers.idle_thresh = 4;
2381 fs_info->endio_raid56_workers.idle_thresh = 4;
2382 fs_info->rmw_workers.idle_thresh = 2;
2353 2383
2354 fs_info->endio_write_workers.idle_thresh = 2; 2384 fs_info->endio_write_workers.idle_thresh = 2;
2355 fs_info->endio_meta_write_workers.idle_thresh = 2; 2385 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2396 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2397 ret |= btrfs_start_workers(&fs_info->endio_workers);
2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2398 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2399 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2400 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2369 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2401 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2370 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2402 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2371 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2403 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ fail_sb_buffer:
2710 btrfs_stop_workers(&fs_info->workers); 2742 btrfs_stop_workers(&fs_info->workers);
2711 btrfs_stop_workers(&fs_info->endio_workers); 2743 btrfs_stop_workers(&fs_info->endio_workers);
2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2744 btrfs_stop_workers(&fs_info->endio_meta_workers);
2745 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2746 btrfs_stop_workers(&fs_info->rmw_workers);
2713 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2747 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2714 btrfs_stop_workers(&fs_info->endio_write_workers); 2748 btrfs_stop_workers(&fs_info->endio_write_workers);
2715 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2749 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ fail_bdi:
2728fail_srcu: 2762fail_srcu:
2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2763 cleanup_srcu_struct(&fs_info->subvol_srcu);
2730fail: 2764fail:
2765 btrfs_free_stripe_hash_table(fs_info);
2731 btrfs_close_devices(fs_info->fs_devices); 2766 btrfs_close_devices(fs_info->fs_devices);
2732 return err; 2767 return err;
2733 2768
@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3111 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0))) 3112 == 0)))
3078 num_tolerated_disk_barrier_failures = 0; 3113 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1 3114 else if (num_tolerated_disk_barrier_failures > 1) {
3080 && 3115 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3116 BTRFS_BLOCK_GROUP_RAID5 |
3082 BTRFS_BLOCK_GROUP_RAID10))) 3117 BTRFS_BLOCK_GROUP_RAID10)) {
3083 num_tolerated_disk_barrier_failures = 1; 3118 num_tolerated_disk_barrier_failures = 1;
3119 } else if (flags &
3120 BTRFS_BLOCK_GROUP_RAID6) {
3121 num_tolerated_disk_barrier_failures = 2;
3122 }
3123 }
3084 } 3124 }
3085 } 3125 }
3086 up_read(&sinfo->groups_sem); 3126 up_read(&sinfo->groups_sem);
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
3384 btrfs_stop_workers(&fs_info->workers); 3424 btrfs_stop_workers(&fs_info->workers);
3385 btrfs_stop_workers(&fs_info->endio_workers); 3425 btrfs_stop_workers(&fs_info->endio_workers);
3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3426 btrfs_stop_workers(&fs_info->endio_meta_workers);
3427 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3428 btrfs_stop_workers(&fs_info->rmw_workers);
3387 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3429 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3388 btrfs_stop_workers(&fs_info->endio_write_workers); 3430 btrfs_stop_workers(&fs_info->endio_write_workers);
3389 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3431 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
3404 bdi_destroy(&fs_info->bdi); 3446 bdi_destroy(&fs_info->bdi);
3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3447 cleanup_srcu_struct(&fs_info->subvol_srcu);
3406 3448
3449 btrfs_free_stripe_hash_table(fs_info);
3450
3407 return 0; 3451 return 0;
3408} 3452}
3409 3453
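The barrier-failure logic reworked above boils down to a per-profile tolerance. A simplified sketch (the helper is illustrative and ignores the metadata/system-chunk special cases the real function also checks):

/* Illustrative only: how many failed devices each profile is expected to
 * tolerate when counting disk barrier failures.
 */
static int tolerated_failures(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return 2;
	if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
		     BTRFS_BLOCK_GROUP_RAID5 |
		     BTRFS_BLOCK_GROUP_RAID10))
		return 1;
	return 0;
}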
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum {
29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3,
33};
34
28static inline u64 btrfs_sb_offset(int mirror) 35static inline u64 btrfs_sb_offset(int mirror)
29{ 36{
30 u64 start = 16 * 1024; 37 u64 start = 16 * 1024;
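With the enum above, callers can ask for the new raid56 completion queue by name instead of a bare integer. A hedged usage sketch: only btrfs_bio_wq_end_io() and BTRFS_WQ_ENDIO_RAID56 come from this patch, the wrapper function itself is hypothetical.

/* Illustrative only: punt completion of a raid56 bio to the new
 * endio-raid56 worker pool before submitting it.
 */
static int submit_raid56_read_bio(struct btrfs_root *root, struct bio *bio)
{
	int ret;

	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
	if (ret)
		return ret;

	submit_bio(READ, bio);
	return 0;
}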
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d133edfcd449..3345f68fc64b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -3276,6 +3279,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3279 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3280 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3281 u64 target;
3282 u64 tmp;
3279 3283
3280 /* 3284 /*
3281 * see if restripe for this chunk_type is in progress, if so 3285 * see if restripe for this chunk_type is in progress, if so
@@ -3292,30 +3296,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3296 }
3293 spin_unlock(&root->fs_info->balance_lock); 3297 spin_unlock(&root->fs_info->balance_lock);
3294 3298
3299 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3300 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3301 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3302 BTRFS_BLOCK_GROUP_RAID5);
3303 if (num_devices < 3)
3304 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3305 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3306 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3307
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3308 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3309 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3310 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3311 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3312
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3313 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3314 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3315 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3316 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3317 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3318 tmp = BTRFS_BLOCK_GROUP_RAID10;
3319 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3320 tmp = BTRFS_BLOCK_GROUP_RAID1;
3321 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3322 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3323
3318 return extended_to_chunk(flags); 3324 return extended_to_chunk(flags | tmp);
3319} 3325}
3320 3326
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3327static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3333,6 +3339,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3339u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3340{
3335 u64 flags; 3341 u64 flags;
3342 u64 ret;
3336 3343
3337 if (data) 3344 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3345 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3348,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3348 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3349 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3350
3344 return get_alloc_profile(root, flags); 3351 ret = get_alloc_profile(root, flags);
3352 return ret;
3345} 3353}
3346 3354
3347/* 3355/*
@@ -3516,8 +3524,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3524{
3517 u64 num_dev; 3525 u64 num_dev;
3518 3526
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3527 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3528 BTRFS_BLOCK_GROUP_RAID0 |
3529 BTRFS_BLOCK_GROUP_RAID5 |
3530 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3531 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3532 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3533 num_dev = 2;
@@ -3667,7 +3677,9 @@ static int can_overcommit(struct btrfs_root *root,
3667 3677
3668 /* 3678 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3679 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3680 * space is actually useable. For raid56, the space info used
3681 * doesn't include the parity drive, so we don't have to
3682 * change the math
3671 */ 3683 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3684 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3685 BTRFS_BLOCK_GROUP_RAID1 |
@@ -5455,10 +5467,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5455 return ret; 5467 return ret;
5456} 5468}
5457 5469
5458static u64 stripe_align(struct btrfs_root *root, u64 val) 5470static u64 stripe_align(struct btrfs_root *root,
5471 struct btrfs_block_group_cache *cache,
5472 u64 val, u64 num_bytes)
5459{ 5473{
5460 u64 mask = ((u64)root->stripesize - 1); 5474 u64 mask;
5461 u64 ret = (val + mask) & ~mask; 5475 u64 ret;
5476 mask = ((u64)root->stripesize - 1);
5477 ret = (val + mask) & ~mask;
5462 return ret; 5478 return ret;
5463} 5479}
5464 5480
@@ -5519,9 +5535,12 @@ int __get_raid_index(u64 flags)
5519 index = 2; 5535 index = 2;
5520 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5536 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5521 index = 3; 5537 index = 3;
5538 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5539 index = 5;
5540 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5541 index = 6;
5522 else 5542 else
5523 index = 4; 5543 index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */
5524
5525 return index; 5544 return index;
5526} 5545}
5527 5546
@@ -5665,6 +5684,8 @@ search:
5665 if (!block_group_bits(block_group, data)) { 5684 if (!block_group_bits(block_group, data)) {
5666 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5685 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5667 BTRFS_BLOCK_GROUP_RAID1 | 5686 BTRFS_BLOCK_GROUP_RAID1 |
5687 BTRFS_BLOCK_GROUP_RAID5 |
5688 BTRFS_BLOCK_GROUP_RAID6 |
5668 BTRFS_BLOCK_GROUP_RAID10; 5689 BTRFS_BLOCK_GROUP_RAID10;
5669 5690
5670 /* 5691 /*
@@ -5835,7 +5856,8 @@ unclustered_alloc:
5835 goto loop; 5856 goto loop;
5836 } 5857 }
5837checks: 5858checks:
5838 search_start = stripe_align(root, offset); 5859 search_start = stripe_align(root, used_block_group,
5860 offset, num_bytes);
5839 5861
5840 /* move on to the next group */ 5862 /* move on to the next group */
5841 if (search_start + num_bytes > 5863 if (search_start + num_bytes >
@@ -7203,6 +7225,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7203 root->fs_info->fs_devices->missing_devices; 7225 root->fs_info->fs_devices->missing_devices;
7204 7226
7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7227 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7228 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7206 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7229 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7207 7230
7208 if (num_devices == 1) { 7231 if (num_devices == 1) {
@@ -7754,7 +7777,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7754 btrfs_release_path(path); 7777 btrfs_release_path(path);
7755 cache->flags = btrfs_block_group_flags(&cache->item); 7778 cache->flags = btrfs_block_group_flags(&cache->item);
7756 cache->sectorsize = root->sectorsize; 7779 cache->sectorsize = root->sectorsize;
7757 7780 cache->full_stripe_len = btrfs_full_stripe_len(root,
7781 &root->fs_info->mapping_tree,
7782 found_key.objectid);
7758 btrfs_init_free_space_ctl(cache); 7783 btrfs_init_free_space_ctl(cache);
7759 7784
7760 /* 7785 /*
@@ -7808,6 +7833,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7808 if (!(get_alloc_profile(root, space_info->flags) & 7833 if (!(get_alloc_profile(root, space_info->flags) &
7809 (BTRFS_BLOCK_GROUP_RAID10 | 7834 (BTRFS_BLOCK_GROUP_RAID10 |
7810 BTRFS_BLOCK_GROUP_RAID1 | 7835 BTRFS_BLOCK_GROUP_RAID1 |
7836 BTRFS_BLOCK_GROUP_RAID5 |
7837 BTRFS_BLOCK_GROUP_RAID6 |
7811 BTRFS_BLOCK_GROUP_DUP))) 7838 BTRFS_BLOCK_GROUP_DUP)))
7812 continue; 7839 continue;
7813 /* 7840 /*
@@ -7883,6 +7910,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7910 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7884 cache->sectorsize = root->sectorsize; 7911 cache->sectorsize = root->sectorsize;
7885 cache->fs_info = root->fs_info; 7912 cache->fs_info = root->fs_info;
7913 cache->full_stripe_len = btrfs_full_stripe_len(root,
7914 &root->fs_info->mapping_tree,
7915 chunk_offset);
7886 7916
7887 atomic_set(&cache->count, 1); 7917 atomic_set(&cache->count, 1);
7888 spin_lock_init(&cache->lock); 7918 spin_lock_init(&cache->lock);
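As a worked example of the can_overcommit() comment changed above (the function and argument names are made up): with 100GiB of raw free space, DUP/RAID1/RAID10 leave roughly half of it usable, while raid5/6 need no adjustment because parity is already excluded from the space_info totals.

/* Illustrative only: usable space after accounting for the profile. */
static u64 usable_free_space(u64 avail, u64 profile)
{
	if (profile & (BTRFS_BLOCK_GROUP_DUP |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID10))
		avail >>= 1;	/* e.g. 100GiB raw -> ~50GiB usable */

	/* raid5/6: parity is not counted in space_info, nothing to do */
	return avail;
}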
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 86ecca48c604..3b9fb478b0d1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1895 if (ret) 1895 if (ret)
1896 err = ret; 1896 err = ret;
1897 1897
1898 if (did_repair) { 1898 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1899 rec->start + rec->len - 1,
1900 rec->start + rec->len - 1, 1900 EXTENT_DAMAGED, GFP_NOFS);
1901 EXTENT_DAMAGED, GFP_NOFS); 1901 if (ret && !err)
1902 if (ret && !err) 1902 err = ret;
1903 err = ret;
1904 }
1905 1903
1906 kfree(rec); 1904 kfree(rec);
1907 return err; 1905 return err;
@@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1932 u64 map_length = 0; 1930 u64 map_length = 0;
1933 u64 sector; 1931 u64 sector;
1934 struct btrfs_bio *bbio = NULL; 1932 struct btrfs_bio *bbio = NULL;
1933 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1935 int ret; 1934 int ret;
1936 1935
1937 BUG_ON(!mirror_num); 1936 BUG_ON(!mirror_num);
1938 1937
1938 /* we can't repair anything in raid56 yet */
1939 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1940 return 0;
1941
1939 bio = bio_alloc(GFP_NOFS, 1); 1942 bio = bio_alloc(GFP_NOFS, 1);
1940 if (!bio) 1943 if (!bio)
1941 return -EIO; 1944 return -EIO;
@@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)
2052 failrec->failed_mirror); 2055 failrec->failed_mirror);
2053 did_repair = !ret; 2056 did_repair = !ret;
2054 } 2057 }
2058 ret = 0;
2055 } 2059 }
2056 2060
2057out: 2061out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..62020b7f7036 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1463,10 +1463,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1463} 1463}
1464 1464
1465static struct btrfs_free_space * 1465static struct btrfs_free_space *
1466find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1466find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1467 unsigned long align)
1467{ 1468{
1468 struct btrfs_free_space *entry; 1469 struct btrfs_free_space *entry;
1469 struct rb_node *node; 1470 struct rb_node *node;
1471 u64 ctl_off;
1472 u64 tmp;
1473 u64 align_off;
1470 int ret; 1474 int ret;
1471 1475
1472 if (!ctl->free_space_offset.rb_node) 1476 if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1485,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1481 if (entry->bytes < *bytes) 1485 if (entry->bytes < *bytes)
1482 continue; 1486 continue;
1483 1487
1488 /* make sure the space returned is big enough
1489 * to match our requested alignment
1490 */
1491 if (*bytes >= align) {
1492 ctl_off = entry->offset - ctl->start;
1493 tmp = ctl_off + align - 1;
1494 do_div(tmp, align);
1495 tmp = tmp * align + ctl->start;
1496 align_off = tmp - entry->offset;
1497 } else {
1498 align_off = 0;
1499 tmp = entry->offset;
1500 }
1501
1502 if (entry->bytes < *bytes + align_off)
1503 continue;
1504
1484 if (entry->bitmap) { 1505 if (entry->bitmap) {
1485 ret = search_bitmap(ctl, entry, offset, bytes); 1506 ret = search_bitmap(ctl, entry, &tmp, bytes);
1486 if (!ret) 1507 if (!ret) {
1508 *offset = tmp;
1487 return entry; 1509 return entry;
1510 }
1488 continue; 1511 continue;
1489 } 1512 }
1490 1513
1491 *offset = entry->offset; 1514 *offset = tmp;
1492 *bytes = entry->bytes; 1515 *bytes = entry->bytes - align_off;
1493 return entry; 1516 return entry;
1494 } 1517 }
1495 1518
@@ -2091,9 +2114,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2091 struct btrfs_free_space *entry = NULL; 2114 struct btrfs_free_space *entry = NULL;
2092 u64 bytes_search = bytes + empty_size; 2115 u64 bytes_search = bytes + empty_size;
2093 u64 ret = 0; 2116 u64 ret = 0;
2117 u64 align_gap = 0;
2118 u64 align_gap_len = 0;
2094 2119
2095 spin_lock(&ctl->tree_lock); 2120 spin_lock(&ctl->tree_lock);
2096 entry = find_free_space(ctl, &offset, &bytes_search); 2121 entry = find_free_space(ctl, &offset, &bytes_search,
2122 block_group->full_stripe_len);
2097 if (!entry) 2123 if (!entry)
2098 goto out; 2124 goto out;
2099 2125
@@ -2103,9 +2129,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2103 if (!entry->bytes) 2129 if (!entry->bytes)
2104 free_bitmap(ctl, entry); 2130 free_bitmap(ctl, entry);
2105 } else { 2131 } else {
2132
2106 unlink_free_space(ctl, entry); 2133 unlink_free_space(ctl, entry);
2107 entry->offset += bytes; 2134 align_gap_len = offset - entry->offset;
2108 entry->bytes -= bytes; 2135 align_gap = entry->offset;
2136
2137 entry->offset = offset + bytes;
2138 WARN_ON(entry->bytes < bytes + align_gap_len);
2139
2140 entry->bytes -= bytes + align_gap_len;
2109 if (!entry->bytes) 2141 if (!entry->bytes)
2110 kmem_cache_free(btrfs_free_space_cachep, entry); 2142 kmem_cache_free(btrfs_free_space_cachep, entry);
2111 else 2143 else
@@ -2115,6 +2147,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2115out: 2147out:
2116 spin_unlock(&ctl->tree_lock); 2148 spin_unlock(&ctl->tree_lock);
2117 2149
2150 if (align_gap_len)
2151 __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2118 return ret; 2152 return ret;
2119} 2153}
2120 2154
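The alignment added to find_free_space() above rounds a candidate offset up to the block group's full stripe length, measured from ctl->start. A standalone sketch of the same arithmetic, using plain division instead of do_div() (the helper name is illustrative):

/* Illustrative only: round 'offset' up to the next 'align' boundary
 * relative to 'base' (ctl->start in the patch).
 */
static u64 round_up_from_base(u64 base, u64 offset, u64 align)
{
	u64 rel = offset - base;
	u64 aligned = ((rel + align - 1) / align) * align;

	return base + aligned;
}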
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b98c4ce3c6f..6f4e41dca970 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,6 +39,7 @@
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/blkdev.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -6386,19 +6387,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6386 int async_submit = 0; 6387 int async_submit = 0;
6387 6388
6388 map_length = orig_bio->bi_size; 6389 map_length = orig_bio->bi_size;
6389 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6390 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6390 &map_length, NULL, 0); 6391 &map_length, NULL, 0);
6391 if (ret) { 6392 if (ret) {
6392 bio_put(orig_bio); 6393 bio_put(orig_bio);
6393 return -EIO; 6394 return -EIO;
6394 } 6395 }
6395
6396 if (map_length >= orig_bio->bi_size) { 6396 if (map_length >= orig_bio->bi_size) {
6397 bio = orig_bio; 6397 bio = orig_bio;
6398 goto submit; 6398 goto submit;
6399 } 6399 }
6400 6400
6401 async_submit = 1; 6401 /* async crcs make it difficult to collect full stripe writes. */
6402 if (btrfs_get_alloc_profile(root, 1) &
6403 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
6404 async_submit = 0;
6405 else
6406 async_submit = 1;
6407
6402 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6408 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6403 if (!bio) 6409 if (!bio)
6404 return -ENOMEM; 6410 return -ENOMEM;
@@ -6440,7 +6446,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6440 bio->bi_end_io = btrfs_end_dio_bio; 6446 bio->bi_end_io = btrfs_end_dio_bio;
6441 6447
6442 map_length = orig_bio->bi_size; 6448 map_length = orig_bio->bi_size;
6443 ret = btrfs_map_block(root->fs_info, READ, 6449 ret = btrfs_map_block(root->fs_info, rw,
6444 start_sector << 9, 6450 start_sector << 9,
6445 &map_length, NULL, 0); 6451 &map_length, NULL, 0);
6446 if (ret) { 6452 if (ret) {
@@ -6583,15 +6589,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
6583{ 6589{
6584 struct file *file = iocb->ki_filp; 6590 struct file *file = iocb->ki_filp;
6585 struct inode *inode = file->f_mapping->host; 6591 struct inode *inode = file->f_mapping->host;
6592 ssize_t ret;
6586 6593
6587 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, 6594 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6588 offset, nr_segs)) 6595 offset, nr_segs))
6589 return 0; 6596 return 0;
6590 6597
6591 return __blockdev_direct_IO(rw, iocb, inode, 6598 ret = __blockdev_direct_IO(rw, iocb, inode,
6592 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, 6599 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6593 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, 6600 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6594 btrfs_submit_direct, 0); 6601 btrfs_submit_direct, 0);
6602 return ret;
6595} 6603}
6596 6604
6597#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) 6605#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..d02510f34936
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,1647 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50struct btrfs_raid_bio {
51 struct btrfs_fs_info *fs_info;
52 struct btrfs_bio *bbio;
53
54 /*
55 * logical block numbers for the start of each stripe
56 * The last one or two are p/q. These are sorted,
57 * so raid_map[0] is the start of our full stripe
58 */
59 u64 *raid_map;
60
61 /* while we're doing rmw on a stripe
62 * we put it into a hash table so we can
63 * lock the stripe and merge more rbios
64 * into it.
65 */
66 struct list_head hash_list;
67
68 /*
69 * for scheduling work in the helper threads
70 */
71 struct btrfs_work work;
72
73 /*
74 * bio list and bio_list_lock are used
75 * to add more bios into the stripe
76 * in hopes of avoiding the full rmw
77 */
78 struct bio_list bio_list;
79 spinlock_t bio_list_lock;
80
81 /*
82 * also protected by the bio_list_lock, the
83 * stripe locking code uses plug_list to hand off
84 * the stripe lock to the next pending IO
85 */
86 struct list_head plug_list;
87
88 /*
89 * flags that tell us if it is safe to
90 * merge with this bio
91 */
92 unsigned long flags;
93
94 /* size of each individual stripe on disk */
95 int stripe_len;
96
97 /* number of data stripes (no p/q) */
98 int nr_data;
99
100 /*
101 * set if we're doing a parity rebuild
102 * for a read from higher up, which is handled
103 * differently from a parity rebuild as part of
104 * rmw
105 */
106 int read_rebuild;
107
108 /* first bad stripe */
109 int faila;
110
111 /* second bad stripe (for raid6 use) */
112 int failb;
113
114 /*
115 * number of pages needed to represent the full
116 * stripe
117 */
118 int nr_pages;
119
120 /*
121 * size of all the bios in the bio_list. This
122 * helps us decide if the rbio maps to a full
123 * stripe or not
124 */
125 int bio_list_bytes;
126
127 atomic_t refs;
128
129 /*
130 * these are two arrays of pointers. We allocate the
131 * rbio big enough to hold them both and setup their
132 * locations when the rbio is allocated
133 */
134
135 /* pointers to pages that we allocated for
136 * reading/writing stripes directly from the disk (including P/Q)
137 */
138 struct page **stripe_pages;
139
140 /*
141 * pointers to the pages in the bio_list. Stored
142 * here for faster lookup
143 */
144 struct page **bio_pages;
145};
146
147static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
148static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
149static void rmw_work(struct btrfs_work *work);
150static void read_rebuild_work(struct btrfs_work *work);
151static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
152static void async_read_rebuild(struct btrfs_raid_bio *rbio);
153static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
154static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
155static void __free_raid_bio(struct btrfs_raid_bio *rbio);
156static void index_rbio_pages(struct btrfs_raid_bio *rbio);
157static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
158
159/*
160 * the stripe hash table is used for locking, and to collect
161 * bios in hopes of making a full stripe
162 */
163int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
164{
165 struct btrfs_stripe_hash_table *table;
166 struct btrfs_stripe_hash_table *x;
167 struct btrfs_stripe_hash *cur;
168 struct btrfs_stripe_hash *h;
169 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
170 int i;
171
172 if (info->stripe_hash_table)
173 return 0;
174
175 table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
176 if (!table)
177 return -ENOMEM;
178
179 table->table = (void *)(table + 1);
180 h = table->table;
181
182 for (i = 0; i < num_entries; i++) {
183 cur = h + i;
184 INIT_LIST_HEAD(&cur->hash_list);
185 spin_lock_init(&cur->lock);
186 init_waitqueue_head(&cur->wait);
187 }
188
189 x = cmpxchg(&info->stripe_hash_table, NULL, table);
190 if (x)
191 kfree(x);
192 return 0;
193}
194
195/*
196 * we hash on the first logical address of the stripe
197 */
198static int rbio_bucket(struct btrfs_raid_bio *rbio)
199{
200 u64 num = rbio->raid_map[0];
201
202 /*
203 * we shift down quite a bit. We're using byte
204 * addressing, and most of the lower bits are zeros.
205 * This tends to upset hash_64, and it consistently
206 * returns just one or two different values.
207 *
208 * shifting off the lower bits fixes things.
209 */
210 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
211}
212
213/*
214 * merging means we take the bio_list from the victim and
215 * splice it into the destination. The victim should
216 * be discarded afterwards.
217 *
218 * must be called with dest->rbio_list_lock held
219 */
220static void merge_rbio(struct btrfs_raid_bio *dest,
221 struct btrfs_raid_bio *victim)
222{
223 bio_list_merge(&dest->bio_list, &victim->bio_list);
224 dest->bio_list_bytes += victim->bio_list_bytes;
225 bio_list_init(&victim->bio_list);
226}
227
228/*
229 * free the hash table used by unmount
230 */
231void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
232{
233 if (!info->stripe_hash_table)
234 return;
235 kfree(info->stripe_hash_table);
236 info->stripe_hash_table = NULL;
237}
238
239/*
240 * helper function to run the xor_blocks api. It is only
241 * able to do MAX_XOR_BLOCKS at a time, so we need to
242 * loop through.
243 */
244static void run_xor(void **pages, int src_cnt, ssize_t len)
245{
246 int src_off = 0;
247 int xor_src_cnt = 0;
248 void *dest = pages[src_cnt];
249
250 while(src_cnt > 0) {
251 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
252 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
253
254 src_cnt -= xor_src_cnt;
255 src_off += xor_src_cnt;
256 }
257}
258
259/*
260 * returns true if the bio list inside this rbio
261 * covers an entire stripe (no rmw required).
262 * Must be called with the bio list lock held, or
263 * at a time when you know it is impossible to add
264 * new bios into the list
265 */
266static int __rbio_is_full(struct btrfs_raid_bio *rbio)
267{
268 unsigned long size = rbio->bio_list_bytes;
269 int ret = 1;
270
271 if (size != rbio->nr_data * rbio->stripe_len)
272 ret = 0;
273
274 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
275 return ret;
276}
277
278static int rbio_is_full(struct btrfs_raid_bio *rbio)
279{
280 unsigned long flags;
281 int ret;
282
283 spin_lock_irqsave(&rbio->bio_list_lock, flags);
284 ret = __rbio_is_full(rbio);
285 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
286 return ret;
287}
288
289/*
290 * returns 1 if it is safe to merge two rbios together.
291 * The merging is safe if the two rbios correspond to
292 * the same stripe and if they are both going in the same
293 * direction (read vs write), and if neither one is
294 * locked for final IO
295 *
296 * The caller is responsible for locking such that
297 * rmw_locked is safe to test
298 */
299static int rbio_can_merge(struct btrfs_raid_bio *last,
300 struct btrfs_raid_bio *cur)
301{
302 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
303 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
304 return 0;
305
306 if (last->raid_map[0] !=
307 cur->raid_map[0])
308 return 0;
309
310 /* reads can't merge with writes */
311 if (last->read_rebuild !=
312 cur->read_rebuild) {
313 return 0;
314 }
315
316 return 1;
317}
318
319/*
320 * helper to index into the pstripe
321 */
322static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
323{
324 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
325 return rbio->stripe_pages[index];
326}
327
328/*
329 * helper to index into the qstripe, returns null
330 * if there is no qstripe
331 */
332static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
333{
334 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
335 return NULL;
336
337 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
338 PAGE_CACHE_SHIFT;
339 return rbio->stripe_pages[index];
340}
341
342/*
343 * The first stripe in the table for a logical address
344 * has the lock. rbios are added in one of three ways:
345 *
346 * 1) Nobody has the stripe locked yet. The rbio is given
347 * the lock and 0 is returned. The caller must start the IO
348 * themselves.
349 *
350 * 2) Someone has the stripe locked, but we're able to merge
351 * with the lock owner. The rbio is freed and the IO will
352 * start automatically along with the existing rbio. 1 is returned.
353 *
354 * 3) Someone has the stripe locked, but we're not able to merge.
355 * The rbio is added to the lock owner's plug list, or merged into
356 * an rbio already on the plug list. When the lock owner unlocks,
357 * the next rbio on the list is run and the IO is started automatically.
358 * 1 is returned
359 *
360 * If we return 0, the caller still owns the rbio and must continue with
361 * IO submission. If we return 1, the caller must assume the rbio has
362 * already been freed.
363 */
364static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
365{
366 int bucket = rbio_bucket(rbio);
367 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
368 struct btrfs_raid_bio *cur;
369 struct btrfs_raid_bio *pending;
370 unsigned long flags;
371 DEFINE_WAIT(wait);
372 struct btrfs_raid_bio *freeit = NULL;
373 int ret = 0;
374 int walk = 0;
375
376 spin_lock_irqsave(&h->lock, flags);
377 list_for_each_entry(cur, &h->hash_list, hash_list) {
378 walk++;
379 if (cur->raid_map[0] == rbio->raid_map[0]) {
380 spin_lock(&cur->bio_list_lock);
381
382 /* can we merge into the lock owner? */
383 if (rbio_can_merge(cur, rbio)) {
384 merge_rbio(cur, rbio);
385 spin_unlock(&cur->bio_list_lock);
386 freeit = rbio;
387 ret = 1;
388 goto out;
389 }
390
391 /*
392 * we couldn't merge with the running
393 * rbio, see if we can merge with the
394 * pending ones. We don't have to
395 * check for rmw_locked because there
396 * is no way they are inside finish_rmw
397 * right now
398 */
399 list_for_each_entry(pending, &cur->plug_list,
400 plug_list) {
401 if (rbio_can_merge(pending, rbio)) {
402 merge_rbio(pending, rbio);
403 spin_unlock(&cur->bio_list_lock);
404 freeit = rbio;
405 ret = 1;
406 goto out;
407 }
408 }
409
410 /* no merging, put us on the tail of the plug list,
411 * our rbio will be started with the currently
412 * running rbio unlocks
413 */
414 list_add_tail(&rbio->plug_list, &cur->plug_list);
415 spin_unlock(&cur->bio_list_lock);
416 ret = 1;
417 goto out;
418 }
419 }
420
421 atomic_inc(&rbio->refs);
422 list_add(&rbio->hash_list, &h->hash_list);
423out:
424 spin_unlock_irqrestore(&h->lock, flags);
425 if (freeit)
426 __free_raid_bio(freeit);
427 return ret;
428}
429
430/*
431 * called as rmw or parity rebuild is completed. If the plug list has more
432 * rbios waiting for this stripe, the next one on the list will be started
433 */
434static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
435{
436 int bucket;
437 struct btrfs_stripe_hash *h;
438 unsigned long flags;
439
440 bucket = rbio_bucket(rbio);
441 h = rbio->fs_info->stripe_hash_table->table + bucket;
442
443 spin_lock_irqsave(&h->lock, flags);
444 spin_lock(&rbio->bio_list_lock);
445
446 if (!list_empty(&rbio->hash_list)) {
447
448 list_del_init(&rbio->hash_list);
449 atomic_dec(&rbio->refs);
450
451 /*
452 * we use the plug list to hold all the rbios
453 * waiting for the chance to lock this stripe.
454 * hand the lock over to one of them.
455 */
456 if (!list_empty(&rbio->plug_list)) {
457 struct btrfs_raid_bio *next;
458 struct list_head *head = rbio->plug_list.next;
459
460 next = list_entry(head, struct btrfs_raid_bio,
461 plug_list);
462
463 list_del_init(&rbio->plug_list);
464
465 list_add(&next->hash_list, &h->hash_list);
466 atomic_inc(&next->refs);
467 spin_unlock(&rbio->bio_list_lock);
468 spin_unlock_irqrestore(&h->lock, flags);
469
470 if (next->read_rebuild)
471 async_read_rebuild(next);
472 else
473 async_rmw_stripe(next);
474
475 goto done_nolock;
476
477 } else if (waitqueue_active(&h->wait)) {
478 spin_unlock(&rbio->bio_list_lock);
479 spin_unlock_irqrestore(&h->lock, flags);
480 wake_up(&h->wait);
481 goto done_nolock;
482 }
483 }
484 spin_unlock(&rbio->bio_list_lock);
485 spin_unlock_irqrestore(&h->lock, flags);
486
487done_nolock:
488 return;
489}
490
491static void __free_raid_bio(struct btrfs_raid_bio *rbio)
492{
493 int i;
494
495 WARN_ON(atomic_read(&rbio->refs) < 0);
496 if (!atomic_dec_and_test(&rbio->refs))
497 return;
498
499 WARN_ON(!list_empty(&rbio->hash_list));
500 WARN_ON(!bio_list_empty(&rbio->bio_list));
501
502 for (i = 0; i < rbio->nr_pages; i++) {
503 if (rbio->stripe_pages[i]) {
504 __free_page(rbio->stripe_pages[i]);
505 rbio->stripe_pages[i] = NULL;
506 }
507 }
508 kfree(rbio->raid_map);
509 kfree(rbio->bbio);
510 kfree(rbio);
511}
512
513static void free_raid_bio(struct btrfs_raid_bio *rbio)
514{
515 unlock_stripe(rbio);
516 __free_raid_bio(rbio);
517}
518
519/*
520 * this frees the rbio and runs through all the bios in the
521 * bio_list and calls end_io on them
522 */
523static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
524{
525 struct bio *cur = bio_list_get(&rbio->bio_list);
526 struct bio *next;
527 free_raid_bio(rbio);
528
529 while (cur) {
530 next = cur->bi_next;
531 cur->bi_next = NULL;
532 if (uptodate)
533 set_bit(BIO_UPTODATE, &cur->bi_flags);
534 bio_endio(cur, err);
535 cur = next;
536 }
537}
538
539/*
540 * end io function used by finish_rmw. When we finally
541 * get here, we've written a full stripe
542 */
543static void raid_write_end_io(struct bio *bio, int err)
544{
545 struct btrfs_raid_bio *rbio = bio->bi_private;
546
547 if (err)
548 fail_bio_stripe(rbio, bio);
549
550 bio_put(bio);
551
552 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
553 return;
554
555 err = 0;
556
557 /* OK, we have written all the stripes we need to. */
558 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
559 err = -EIO;
560
561 rbio_orig_end_io(rbio, err, 0);
562 return;
563}
564
565/*
566 * the read/modify/write code wants to use the original bio for
567 * any pages it included, and then use the rbio for everything
568 * else. This function decides if a given index (stripe number)
569 * and page number in that stripe fall inside the original bio
570 * or the rbio.
571 *
572 * if you set bio_list_only, you'll get a NULL back for any ranges
573 * that are outside the bio_list
574 *
575 * This doesn't take any refs on anything, you get a bare page pointer
576 * and the caller must bump refs as required.
577 *
578 * You must call index_rbio_pages once before you can trust
579 * the answers from this function.
580 */
581static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
582 int index, int pagenr, int bio_list_only)
583{
584 int chunk_page;
585 struct page *p = NULL;
586
587 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
588
589 spin_lock_irq(&rbio->bio_list_lock);
590 p = rbio->bio_pages[chunk_page];
591 spin_unlock_irq(&rbio->bio_list_lock);
592
593 if (p || bio_list_only)
594 return p;
595
596 return rbio->stripe_pages[chunk_page];
597}
598
599/*
600 * number of pages we need for the entire stripe across all the
601 * drives
602 */
603static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
604{
605 unsigned long nr = stripe_len * nr_stripes;
606 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
607}
608
609/*
610 * allocation and initial setup for the btrfs_raid_bio. Note
611 * this does not allocate any pages for rbio->pages.
612 */
613static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
614 struct btrfs_bio *bbio, u64 *raid_map,
615 u64 stripe_len)
616{
617 struct btrfs_raid_bio *rbio;
618 int nr_data = 0;
619 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
620 void *p;
621
622 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
623 GFP_NOFS);
624 if (!rbio) {
625 kfree(raid_map);
626 kfree(bbio);
627 return ERR_PTR(-ENOMEM);
628 }
629
630 bio_list_init(&rbio->bio_list);
631 INIT_LIST_HEAD(&rbio->plug_list);
632 spin_lock_init(&rbio->bio_list_lock);
633 INIT_LIST_HEAD(&rbio->hash_list);
634 rbio->bbio = bbio;
635 rbio->raid_map = raid_map;
636 rbio->fs_info = root->fs_info;
637 rbio->stripe_len = stripe_len;
638 rbio->nr_pages = num_pages;
639 rbio->faila = -1;
640 rbio->failb = -1;
641 atomic_set(&rbio->refs, 1);
642
643 /*
644 * the stripe_pages and bio_pages array point to the extra
645 * memory we allocated past the end of the rbio
646 */
647 p = rbio + 1;
648 rbio->stripe_pages = p;
649 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
650
651 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
652 nr_data = bbio->num_stripes - 2;
653 else
654 nr_data = bbio->num_stripes - 1;
655
656 rbio->nr_data = nr_data;
657 return rbio;
658}
659
660/* allocate pages for all the stripes in the bio, including parity */
661static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
662{
663 int i;
664 struct page *page;
665
666 for (i = 0; i < rbio->nr_pages; i++) {
667 if (rbio->stripe_pages[i])
668 continue;
669 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
670 if (!page)
671 return -ENOMEM;
672 rbio->stripe_pages[i] = page;
673 ClearPageUptodate(page);
674 }
675 return 0;
676}
677
678/* allocate pages for just the p/q stripes */
679static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
680{
681 int i;
682 struct page *page;
683
684 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
685
686 for (; i < rbio->nr_pages; i++) {
687 if (rbio->stripe_pages[i])
688 continue;
689 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
690 if (!page)
691 return -ENOMEM;
692 rbio->stripe_pages[i] = page;
693 }
694 return 0;
695}
696
697/*
698 * add a single page from a specific stripe into our list of bios for IO
699 * this will try to merge into existing bios if possible, and returns
700 * zero if all went well.
701 */
702int rbio_add_io_page(struct btrfs_raid_bio *rbio,
703 struct bio_list *bio_list,
704 struct page *page,
705 int stripe_nr,
706 unsigned long page_index,
707 unsigned long bio_max_len)
708{
709 struct bio *last = bio_list->tail;
710 u64 last_end = 0;
711 int ret;
712 struct bio *bio;
713 struct btrfs_bio_stripe *stripe;
714 u64 disk_start;
715
716 stripe = &rbio->bbio->stripes[stripe_nr];
717 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
718
719 /* if the device is missing, just fail this stripe */
720 if (!stripe->dev->bdev)
721 return fail_rbio_index(rbio, stripe_nr);
722
723 /* see if we can add this page onto our existing bio */
724 if (last) {
725 last_end = (u64)last->bi_sector << 9;
726 last_end += last->bi_size;
727
728 /*
729 * we can't merge these if they are from different
730 * devices or if they are not contiguous
731 */
732 if (last_end == disk_start && stripe->dev->bdev &&
733 test_bit(BIO_UPTODATE, &last->bi_flags) &&
734 last->bi_bdev == stripe->dev->bdev) {
735 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
736 if (ret == PAGE_CACHE_SIZE)
737 return 0;
738 }
739 }
740
741 /* put a new bio on the list */
742 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
743 if (!bio)
744 return -ENOMEM;
745
746 bio->bi_size = 0;
747 bio->bi_bdev = stripe->dev->bdev;
748 bio->bi_sector = disk_start >> 9;
749 set_bit(BIO_UPTODATE, &bio->bi_flags);
750
751 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
752 bio_list_add(bio_list, bio);
753 return 0;
754}
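
The merge test above reduces to: same block device, and the new page's disk_start begins exactly where the previous bio ends (bi_sector << 9 plus bi_size). A hedged userspace sketch of just that contiguity check (toy types; bi_max_vecs and the uptodate flag are ignored):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

struct fake_bio {
        int dev;                /* stand-in for bio->bi_bdev */
        uint64_t start;         /* byte offset, i.e. bi_sector << 9 */
        uint64_t size;          /* bi_size */
};

/* can a page at disk_start on device dev be appended to the last bio? */
static bool can_merge(const struct fake_bio *last, int dev, uint64_t disk_start)
{
        if (!last)
                return false;
        return last->dev == dev && last->start + last->size == disk_start;
}

int main(void)
{
        struct fake_bio last = { .dev = 2, .start = 1 << 20, .size = 3 * PAGE_SIZE };

        printf("%d\n", can_merge(&last, 2, (1 << 20) + 3 * PAGE_SIZE)); /* 1: contiguous */
        printf("%d\n", can_merge(&last, 3, (1 << 20) + 3 * PAGE_SIZE)); /* 0: other device */
        return 0;
}
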
755
756/*
757 * while we're doing the read/modify/write cycle, we could
758 * have errors in reading pages off the disk. This checks
759 * for errors and if we're not able to read the page it'll
760 * trigger parity reconstruction. The rmw will be finished
761 * after we've reconstructed the failed stripes
762 */
763static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
764{
765 if (rbio->faila >= 0 || rbio->failb >= 0) {
766 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
767 __raid56_parity_recover(rbio);
768 } else {
769 finish_rmw(rbio);
770 }
771}
772
773/*
774 * these are just the pages from the rbio array, not from anything
775 * the FS sent down to us
776 */
777static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
778{
779 int index;
780 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
781 index += page;
782 return rbio->stripe_pages[index];
783}
784
785/*
786 * helper function to walk our bio list and populate the bio_pages array with
787 * the result. This seems expensive, but it is faster than constantly
788 * searching through the bio list as we set up the IO in finish_rmw or stripe
789 * reconstruction.
790 *
791 * This must be called before you trust the answers from page_in_rbio
792 */
793static void index_rbio_pages(struct btrfs_raid_bio *rbio)
794{
795 struct bio *bio;
796 u64 start;
797 unsigned long stripe_offset;
798 unsigned long page_index;
799 struct page *p;
800 int i;
801
802 spin_lock_irq(&rbio->bio_list_lock);
803 bio_list_for_each(bio, &rbio->bio_list) {
804 start = (u64)bio->bi_sector << 9;
805 stripe_offset = start - rbio->raid_map[0];
806 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
807
808 for (i = 0; i < bio->bi_vcnt; i++) {
809 p = bio->bi_io_vec[i].bv_page;
810 rbio->bio_pages[page_index + i] = p;
811 }
812 }
813 spin_unlock_irq(&rbio->bio_list_lock);
814}
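
The indexing above is plain offset arithmetic: raid_map[0] is the logical address where the full stripe's data begins, so (bio start - raid_map[0]) >> PAGE_CACHE_SHIFT is the slot in bio_pages for the bio's first page. A worked sketch (illustrative addresses, 4KiB pages assumed):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12           /* assumption: 4KiB pages */

int main(void)
{
        uint64_t raid_map0 = 10ULL << 30;           /* full stripe data starts at 10GiB */
        uint64_t bio_start = raid_map0 + 80 * 1024; /* bio begins 80KiB into the stripe */

        /* slot inside the bio_pages array for this bio's first page */
        unsigned long page_index = (unsigned long)((bio_start - raid_map0) >> PAGE_SHIFT);
        printf("page_index = %lu\n", page_index);   /* 20 */
        return 0;
}
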
815
816/*
817 * this is called in one of two situations. We either
818 * have a full stripe from the higher layers, or we've read all
819 * the missing bits off disk.
820 *
821 * This will calculate the parity and then send down any
822 * changed blocks.
823 */
824static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
825{
826 struct btrfs_bio *bbio = rbio->bbio;
827 void *pointers[bbio->num_stripes];
828 int stripe_len = rbio->stripe_len;
829 int nr_data = rbio->nr_data;
830 int stripe;
831 int pagenr;
832 int p_stripe = -1;
833 int q_stripe = -1;
834 struct bio_list bio_list;
835 struct bio *bio;
836 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
837 int ret;
838
839 bio_list_init(&bio_list);
840
841 if (bbio->num_stripes - rbio->nr_data == 1) {
842 p_stripe = bbio->num_stripes - 1;
843 } else if (bbio->num_stripes - rbio->nr_data == 2) {
844 p_stripe = bbio->num_stripes - 2;
845 q_stripe = bbio->num_stripes - 1;
846 } else {
847 BUG();
848 }
849
850 /* at this point we either have a full stripe,
851 * or we've read the full stripe from the drive.
852 * recalculate the parity and write the new results.
853 *
854 * We're not allowed to add any new bios to the
855 * bio list here, anyone else that wants to
856 * change this stripe needs to do their own rmw.
857 */
858 spin_lock_irq(&rbio->bio_list_lock);
859 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
860 spin_unlock_irq(&rbio->bio_list_lock);
861
862 atomic_set(&rbio->bbio->error, 0);
863
864 /*
865 * now that we've set rmw_locked, run through the
866 * bio list one last time and map the page pointers
867 */
868 index_rbio_pages(rbio);
869
870 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
871 struct page *p;
872 /* first collect one page from each data stripe */
873 for (stripe = 0; stripe < nr_data; stripe++) {
874 p = page_in_rbio(rbio, stripe, pagenr, 0);
875 pointers[stripe] = kmap(p);
876 }
877
878 /* then add the parity stripe */
879 p = rbio_pstripe_page(rbio, pagenr);
880 SetPageUptodate(p);
881 pointers[stripe++] = kmap(p);
882
883 if (q_stripe != -1) {
884
885 /*
886 * raid6, add the qstripe and call the
887 * library function to fill in our p/q
888 */
889 p = rbio_qstripe_page(rbio, pagenr);
890 SetPageUptodate(p);
891 pointers[stripe++] = kmap(p);
892
893 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
894 pointers);
895 } else {
896 /* raid5 */
897 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
898 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
899 }
900
901
902 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
903 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
904 }
905
906 /*
907 * time to start writing. Make bios for everything from the
908 * higher layers (the bio_list in our rbio) and our p/q. Ignore
909 * everything else.
910 */
911 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
912 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
913 struct page *page;
914 if (stripe < rbio->nr_data) {
915 page = page_in_rbio(rbio, stripe, pagenr, 1);
916 if (!page)
917 continue;
918 } else {
919 page = rbio_stripe_page(rbio, stripe, pagenr);
920 }
921
922 ret = rbio_add_io_page(rbio, &bio_list,
923 page, stripe, pagenr, rbio->stripe_len);
924 if (ret)
925 goto cleanup;
926 }
927 }
928
929 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
930 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
931
932 while (1) {
933 bio = bio_list_pop(&bio_list);
934 if (!bio)
935 break;
936
937 bio->bi_private = rbio;
938 bio->bi_end_io = raid_write_end_io;
939 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
940 submit_bio(WRITE, bio);
941 }
942 return;
943
944cleanup:
945 rbio_orig_end_io(rbio, -EIO, 0);
946}
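
In the RAID5 branch above, parity is simply the XOR of one page from each data stripe; the RAID6 branch hands the same pointer array to raid6_call.gen_syndrome() to produce P and Q. A minimal userspace sketch of the XOR step only, using small buffers in place of pages (run_xor() is open-coded here):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BLOCK 16                /* illustrative block size, not PAGE_SIZE */

/* P = D0 ^ D1 ^ ... ^ Dn-1, computed the way finish_rmw() does it:
 * copy the first data block into the parity buffer, then xor the rest in. */
static void gen_p(uint8_t *p, uint8_t data[][BLOCK], int nr_data)
{
        int i, b;

        memcpy(p, data[0], BLOCK);
        for (i = 1; i < nr_data; i++)
                for (b = 0; b < BLOCK; b++)
                        p[b] ^= data[i][b];
}

int main(void)
{
        uint8_t data[3][BLOCK] = { "stripe-0 data..", "stripe-1 data..", "stripe-2 data.." };
        uint8_t p[BLOCK];

        gen_p(p, data, 3);

        /* losing data[1] and xoring P with the survivors recovers it */
        uint8_t rebuilt[BLOCK];
        memcpy(rebuilt, p, BLOCK);
        for (int b = 0; b < BLOCK; b++)
                rebuilt[b] ^= data[0][b] ^ data[2][b];

        printf("%s\n", memcmp(rebuilt, data[1], BLOCK) ? "mismatch" : "recovered");
        return 0;
}
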
947
948/*
949 * helper to find the stripe number for a given bio. Used to figure out which
950 * stripe has failed. This expects the bio to correspond to a physical disk,
951 * so it looks up based on physical sector numbers.
952 */
953static int find_bio_stripe(struct btrfs_raid_bio *rbio,
954 struct bio *bio)
955{
956 u64 physical = bio->bi_sector;
957 u64 stripe_start;
958 int i;
959 struct btrfs_bio_stripe *stripe;
960
961 physical <<= 9;
962
963 for (i = 0; i < rbio->bbio->num_stripes; i++) {
964 stripe = &rbio->bbio->stripes[i];
965 stripe_start = stripe->physical;
966 if (physical >= stripe_start &&
967 physical < stripe_start + rbio->stripe_len) {
968 return i;
969 }
970 }
971 return -1;
972}
973
974/*
975 * helper to find the stripe number for a given
976 * bio (before mapping). Used to figure out which stripe has
977 * failed. This looks up based on logical block numbers.
978 */
979static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
980 struct bio *bio)
981{
982 u64 logical = bio->bi_sector;
983 u64 stripe_start;
984 int i;
985
986 logical <<= 9;
987
988 for (i = 0; i < rbio->nr_data; i++) {
989 stripe_start = rbio->raid_map[i];
990 if (logical >= stripe_start &&
991 logical < stripe_start + rbio->stripe_len) {
992 return i;
993 }
994 }
995 return -1;
996}
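
Both lookups above are short linear scans; the logical variant walks raid_map[], which holds the logical start of each data stripe in this full stripe. A worked sketch with illustrative addresses and the fixed 64KiB stripe length:

#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN (64 * 1024ULL)

/* return the data-stripe index whose range covers 'logical', or -1 */
static int find_stripe(const uint64_t *raid_map, int nr_data, uint64_t logical)
{
        for (int i = 0; i < nr_data; i++)
                if (logical >= raid_map[i] && logical < raid_map[i] + STRIPE_LEN)
                        return i;
        return -1;
}

int main(void)
{
        /* 3 data stripes of a 5-disk RAID6 full stripe, logically contiguous */
        uint64_t raid_map[3] = { 0x100000, 0x110000, 0x120000 };

        printf("%d\n", find_stripe(raid_map, 3, 0x118000));    /* 1 */
        printf("%d\n", find_stripe(raid_map, 3, 0x200000));    /* -1 */
        return 0;
}
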
997
998/*
999 * returns -EIO if we had too many failures
1000 */
1001static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1002{
1003 unsigned long flags;
1004 int ret = 0;
1005
1006 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1007
1008 /* we already know this stripe is bad, move on */
1009 if (rbio->faila == failed || rbio->failb == failed)
1010 goto out;
1011
1012 if (rbio->faila == -1) {
1013 /* first failure on this rbio */
1014 rbio->faila = failed;
1015 atomic_inc(&rbio->bbio->error);
1016 } else if (rbio->failb == -1) {
1017 /* second failure on this rbio */
1018 rbio->failb = failed;
1019 atomic_inc(&rbio->bbio->error);
1020 } else {
1021 ret = -EIO;
1022 }
1023out:
1024 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1025
1026 return ret;
1027}
1028
1029/*
1030 * helper to fail a stripe based on a physical disk
1031 * bio.
1032 */
1033static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1034 struct bio *bio)
1035{
1036 int failed = find_bio_stripe(rbio, bio);
1037
1038 if (failed < 0)
1039 return -EIO;
1040
1041 return fail_rbio_index(rbio, failed);
1042}
1043
1044/*
1045 * this sets each page in the bio uptodate. It should only be used on private
1046 * rbio pages, nothing that comes in from the higher layers
1047 */
1048static void set_bio_pages_uptodate(struct bio *bio)
1049{
1050 int i;
1051 struct page *p;
1052
1053 for (i = 0; i < bio->bi_vcnt; i++) {
1054 p = bio->bi_io_vec[i].bv_page;
1055 SetPageUptodate(p);
1056 }
1057}
1058
1059/*
1060 * end io for the read phase of the rmw cycle. All the bios here are physical
1061 * stripe bios we've read from the disk so we can recalculate the parity of the
1062 * stripe.
1063 *
1064 * This will usually kick off finish_rmw once all the bios are read in, but it
1065 * may trigger parity reconstruction if we had any errors along the way
1066 */
1067static void raid_rmw_end_io(struct bio *bio, int err)
1068{
1069 struct btrfs_raid_bio *rbio = bio->bi_private;
1070
1071 if (err)
1072 fail_bio_stripe(rbio, bio);
1073 else
1074 set_bio_pages_uptodate(bio);
1075
1076 bio_put(bio);
1077
1078 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1079 return;
1080
1081 err = 0;
1082 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1083 goto cleanup;
1084
1085 /*
1086 * this will normally call finish_rmw to start our write
1087 * but if there are any failed stripes we'll reconstruct
1088 * from parity first
1089 */
1090 validate_rbio_for_rmw(rbio);
1091 return;
1092
1093cleanup:
1094
1095 rbio_orig_end_io(rbio, -EIO, 0);
1096}
1097
1098static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1099{
1100 rbio->work.flags = 0;
1101 rbio->work.func = rmw_work;
1102
1103 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1104 &rbio->work);
1105}
1106
1107static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1108{
1109 rbio->work.flags = 0;
1110 rbio->work.func = read_rebuild_work;
1111
1112 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1113 &rbio->work);
1114}
1115
1116/*
1117 * the stripe must be locked by the caller. It will
1118 * unlock after all the writes are done
1119 */
1120static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1121{
1122 int bios_to_read = 0;
1123 struct btrfs_bio *bbio = rbio->bbio;
1124 struct bio_list bio_list;
1125 int ret;
1126 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1127 int pagenr;
1128 int stripe;
1129 struct bio *bio;
1130
1131 bio_list_init(&bio_list);
1132
1133 ret = alloc_rbio_pages(rbio);
1134 if (ret)
1135 goto cleanup;
1136
1137 index_rbio_pages(rbio);
1138
1139 atomic_set(&rbio->bbio->error, 0);
1140 /*
1141 * build a list of bios to read all the missing parts of this
1142 * stripe
1143 */
1144 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1145 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1146 struct page *page;
1147 /*
1148 * we want to find all the pages missing from
1149 * the rbio and read them from the disk. If
1150 * page_in_rbio finds a page in the bio list
1151 * we don't need to read it off the stripe.
1152 */
1153 page = page_in_rbio(rbio, stripe, pagenr, 1);
1154 if (page)
1155 continue;
1156
1157 page = rbio_stripe_page(rbio, stripe, pagenr);
1158 ret = rbio_add_io_page(rbio, &bio_list, page,
1159 stripe, pagenr, rbio->stripe_len);
1160 if (ret)
1161 goto cleanup;
1162 }
1163 }
1164
1165 bios_to_read = bio_list_size(&bio_list);
1166 if (!bios_to_read) {
1167 /*
1168 * this can happen if others have merged with
1169 * us; it means there is nothing left to read.
1170 * But if there are missing devices it may not be
1171 * safe to do the full stripe write yet.
1172 */
1173 goto finish;
1174 }
1175
1176 /*
1177 * the bbio may be freed once we submit the last bio. Make sure
1178 * not to touch it after that
1179 */
1180 atomic_set(&bbio->stripes_pending, bios_to_read);
1181 while (1) {
1182 bio = bio_list_pop(&bio_list);
1183 if (!bio)
1184 break;
1185
1186 bio->bi_private = rbio;
1187 bio->bi_end_io = raid_rmw_end_io;
1188
1189 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1190 BTRFS_WQ_ENDIO_RAID56);
1191
1192 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1193 submit_bio(READ, bio);
1194 }
1195 /* the actual write will happen once the reads are done */
1196 return 0;
1197
1198cleanup:
1199 rbio_orig_end_io(rbio, -EIO, 0);
1200 return -EIO;
1201
1202finish:
1203 validate_rbio_for_rmw(rbio);
1204 return 0;
1205}
1206
1207/*
1208 * if the upper layers pass in a full stripe, we thank them by only allocating
1209 * enough pages to hold the parity, and sending it all down quickly.
1210 */
1211static int full_stripe_write(struct btrfs_raid_bio *rbio)
1212{
1213 int ret;
1214
1215 ret = alloc_rbio_parity_pages(rbio);
1216 if (ret)
1217 return ret;
1218
1219 ret = lock_stripe_add(rbio);
1220 if (ret == 0)
1221 finish_rmw(rbio);
1222 return 0;
1223}
1224
1225/*
1226 * partial stripe writes get handed over to async helpers.
1227 * We're really hoping to merge a few more writes into this
1228 * rbio before calculating new parity
1229 */
1230static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1231{
1232 int ret;
1233
1234 ret = lock_stripe_add(rbio);
1235 if (ret == 0)
1236 async_rmw_stripe(rbio);
1237 return 0;
1238}
1239
1240/*
1241 * sometimes while we're reading from the drive to
1242 * recalculate parity, enough new bios come in to create
1243 * a full stripe. So we do a check here to see if we can
1244 * go directly to finish_rmw
1245 */
1246static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1247{
1248 /* head off into rmw land if we don't have a full stripe */
1249 if (!rbio_is_full(rbio))
1250 return partial_stripe_write(rbio);
1251 return full_stripe_write(rbio);
1252}
1253
1254/*
1255 * our main entry point for writes from the rest of the FS.
1256 */
1257int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1258 struct btrfs_bio *bbio, u64 *raid_map,
1259 u64 stripe_len)
1260{
1261 struct btrfs_raid_bio *rbio;
1262
1263 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1264 if (IS_ERR(rbio)) {
1265 kfree(raid_map);
1266 kfree(bbio);
1267 return PTR_ERR(rbio);
1268 }
1269 bio_list_add(&rbio->bio_list, bio);
1270 rbio->bio_list_bytes = bio->bi_size;
1271 return __raid56_parity_write(rbio);
1272}
1273
1274/*
1275 * all parity reconstruction happens here. We've read in everything
1276 * we can find from the drives and this does the heavy lifting of
1277 * sorting the good from the bad.
1278 */
1279static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1280{
1281 int pagenr, stripe;
1282 void **pointers;
1283 int faila = -1, failb = -1;
1284 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1285 struct page *page;
1286 int err;
1287 int i;
1288
1289 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1290 GFP_NOFS);
1291 if (!pointers) {
1292 err = -ENOMEM;
1293 goto cleanup_io;
1294 }
1295
1296 faila = rbio->faila;
1297 failb = rbio->failb;
1298
1299 if (rbio->read_rebuild) {
1300 spin_lock_irq(&rbio->bio_list_lock);
1301 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1302 spin_unlock_irq(&rbio->bio_list_lock);
1303 }
1304
1305 index_rbio_pages(rbio);
1306
1307 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1308 /* setup our array of pointers with pages
1309 * from each stripe
1310 */
1311 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1312 /*
1313 * if we're rebuilding a read, we have to use
1314 * pages from the bio list
1315 */
1316 if (rbio->read_rebuild &&
1317 (stripe == faila || stripe == failb)) {
1318 page = page_in_rbio(rbio, stripe, pagenr, 0);
1319 } else {
1320 page = rbio_stripe_page(rbio, stripe, pagenr);
1321 }
1322 pointers[stripe] = kmap(page);
1323 }
1324
1325 /* all raid6 handling here */
1326 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1327 RAID6_Q_STRIPE) {
1328
1329 /*
1330 * single failure, rebuild from parity raid5
1331 * style
1332 */
1333 if (failb < 0) {
1334 if (faila == rbio->nr_data) {
1335 /*
1336 * Just the P stripe has failed, without
1337 * a bad data or Q stripe.
1338 * TODO, we should redo the xor here.
1339 */
1340 err = -EIO;
1341 goto cleanup;
1342 }
1343 /*
1344 * a single failure in raid6 is rebuilt
1345 * in the pstripe code below
1346 */
1347 goto pstripe;
1348 }
1349
1350 /* make sure our ps and qs are in order */
1351 if (faila > failb) {
1352 int tmp = failb;
1353 failb = faila;
1354 faila = tmp;
1355 }
1356
1357 /* if the q stripe is failed, do a pstripe reconstruction
1358 * from the xors.
1359 * If both the q stripe and the P stripe are failed, we're
1360 * here due to a crc mismatch and we can't give them the
1361 * data they want
1362 */
1363 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1364 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1365 err = -EIO;
1366 goto cleanup;
1367 }
1368 /*
1369 * otherwise we have one bad data stripe and
1370 * a good P stripe. raid5!
1371 */
1372 goto pstripe;
1373 }
1374
1375 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1376 raid6_datap_recov(rbio->bbio->num_stripes,
1377 PAGE_SIZE, faila, pointers);
1378 } else {
1379 raid6_2data_recov(rbio->bbio->num_stripes,
1380 PAGE_SIZE, faila, failb,
1381 pointers);
1382 }
1383 } else {
1384 void *p;
1385
1386 /* rebuild from P stripe here (raid5 or raid6) */
1387 BUG_ON(failb != -1);
1388pstripe:
1389 /* Copy parity block into failed block to start with */
1390 memcpy(pointers[faila],
1391 pointers[rbio->nr_data],
1392 PAGE_CACHE_SIZE);
1393
1394 /* rearrange the pointer array */
1395 p = pointers[faila];
1396 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1397 pointers[stripe] = pointers[stripe + 1];
1398 pointers[rbio->nr_data - 1] = p;
1399
1400 /* xor in the rest */
1401 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1402 }
1403 /* if we're doing this rebuild as part of an rmw, go through
1404 * and set all of our private rbio pages in the
1405 * failed stripes as uptodate. This way finish_rmw will
1406 * know they can be trusted. If this was a read reconstruction,
1407 * other endio functions will fiddle the uptodate bits
1408 */
1409 if (!rbio->read_rebuild) {
1410 for (i = 0; i < nr_pages; i++) {
1411 if (faila != -1) {
1412 page = rbio_stripe_page(rbio, faila, i);
1413 SetPageUptodate(page);
1414 }
1415 if (failb != -1) {
1416 page = rbio_stripe_page(rbio, failb, i);
1417 SetPageUptodate(page);
1418 }
1419 }
1420 }
1421 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1422 /*
1423 * if we're rebuilding a read, we have to use
1424 * pages from the bio list
1425 */
1426 if (rbio->read_rebuild &&
1427 (stripe == faila || stripe == failb)) {
1428 page = page_in_rbio(rbio, stripe, pagenr, 0);
1429 } else {
1430 page = rbio_stripe_page(rbio, stripe, pagenr);
1431 }
1432 kunmap(page);
1433 }
1434 }
1435
1436 err = 0;
1437cleanup:
1438 kfree(pointers);
1439
1440cleanup_io:
1441
1442 if (rbio->read_rebuild) {
1443 rbio_orig_end_io(rbio, err, err == 0);
1444 } else if (err == 0) {
1445 rbio->faila = -1;
1446 rbio->failb = -1;
1447 finish_rmw(rbio);
1448 } else {
1449 rbio_orig_end_io(rbio, err, 0);
1450 }
1451}
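
The pstripe path above rebuilds a single lost data block in three moves: copy P over the failed slot, rotate the pointer array so the surviving blocks come first, then XOR them back in. A hedged userspace sketch of that sequence with tiny buffers standing in for pages:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BLOCK 8

static void xor_into(uint8_t *dst, const uint8_t *src)
{
        for (int b = 0; b < BLOCK; b++)
                dst[b] ^= src[b];
}

int main(void)
{
        uint8_t d0[BLOCK] = "AAAAAAA", d1[BLOCK] = "BBBBBBB", d2[BLOCK] = "CCCCCCC";
        uint8_t p[BLOCK], lost[BLOCK];
        uint8_t *pointers[4] = { d0, d1, d2, p };       /* nr_data = 3, parity last */
        int nr_data = 3, faila = 1;                     /* pretend d1 is gone */

        /* parity as the rmw path would have written it */
        memcpy(p, d0, BLOCK);
        xor_into(p, d1);
        xor_into(p, d2);

        /* step 1: copy parity into the failed slot */
        memcpy(lost, pointers[nr_data], BLOCK);
        pointers[faila] = lost;

        /* step 2: rotate so the failed slot ends up where the parity was */
        uint8_t *tmp = pointers[faila];
        for (int s = faila; s < nr_data - 1; s++)
                pointers[s] = pointers[s + 1];
        pointers[nr_data - 1] = tmp;

        /* step 3: xor the surviving data blocks into it */
        for (int s = 0; s < nr_data - 1; s++)
                xor_into(pointers[nr_data - 1], pointers[s]);

        printf("%s\n", memcmp(lost, d1, BLOCK) ? "mismatch" : "recovered d1");
        return 0;
}
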
1452
1453/*
1454 * This is called only for stripes we've read from disk to
1455 * reconstruct the parity.
1456 */
1457static void raid_recover_end_io(struct bio *bio, int err)
1458{
1459 struct btrfs_raid_bio *rbio = bio->bi_private;
1460
1461 /*
1462 * we only read stripe pages off the disk, set them
1463 * up to date if there were no errors
1464 */
1465 if (err)
1466 fail_bio_stripe(rbio, bio);
1467 else
1468 set_bio_pages_uptodate(bio);
1469 bio_put(bio);
1470
1471 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1472 return;
1473
1474 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1475 rbio_orig_end_io(rbio, -EIO, 0);
1476 else
1477 __raid_recover_end_io(rbio);
1478}
1479
1480/*
1481 * reads everything we need off the disk to reconstruct
1482 * the parity. endio handlers trigger final reconstruction
1483 * when the IO is done.
1484 *
1485 * This is used both for reads from the higher layers and for
1486 * parity construction required to finish an rmw cycle.
1487 */
1488static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1489{
1490 int bios_to_read = 0;
1491 struct btrfs_bio *bbio = rbio->bbio;
1492 struct bio_list bio_list;
1493 int ret;
1494 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1495 int pagenr;
1496 int stripe;
1497 struct bio *bio;
1498
1499 bio_list_init(&bio_list);
1500
1501 ret = alloc_rbio_pages(rbio);
1502 if (ret)
1503 goto cleanup;
1504
1505 atomic_set(&rbio->bbio->error, 0);
1506
1507 /*
1508 * read everything that hasn't failed.
1509 */
1510 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1511 if (rbio->faila == stripe ||
1512 rbio->failb == stripe)
1513 continue;
1514
1515 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1516 struct page *p;
1517
1518 /*
1519 * the rmw code may have already read this
1520 * page in
1521 */
1522 p = rbio_stripe_page(rbio, stripe, pagenr);
1523 if (PageUptodate(p))
1524 continue;
1525
1526 ret = rbio_add_io_page(rbio, &bio_list,
1527 rbio_stripe_page(rbio, stripe, pagenr),
1528 stripe, pagenr, rbio->stripe_len);
1529 if (ret < 0)
1530 goto cleanup;
1531 }
1532 }
1533
1534 bios_to_read = bio_list_size(&bio_list);
1535 if (!bios_to_read) {
1536 /*
1537 * we might have no bios to read just because the pages
1538 * were up to date, or we might have no bios to read because
1539 * the devices were gone.
1540 */
1541 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1542 __raid_recover_end_io(rbio);
1543 goto out;
1544 } else {
1545 goto cleanup;
1546 }
1547 }
1548
1549 /*
1550 * the bbio may be freed once we submit the last bio. Make sure
1551 * not to touch it after that
1552 */
1553 atomic_set(&bbio->stripes_pending, bios_to_read);
1554 while (1) {
1555 bio = bio_list_pop(&bio_list);
1556 if (!bio)
1557 break;
1558
1559 bio->bi_private = rbio;
1560 bio->bi_end_io = raid_recover_end_io;
1561
1562 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1563 BTRFS_WQ_ENDIO_RAID56);
1564
1565 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1566 submit_bio(READ, bio);
1567 }
1568out:
1569 return 0;
1570
1571cleanup:
1572 if (rbio->read_rebuild)
1573 rbio_orig_end_io(rbio, -EIO, 0);
1574 return -EIO;
1575}
1576
1577/*
1578 * the main entry point for reads from the higher layers. This
1579 * is really only called when the normal read path had a failure,
1580 * so we assume the bio they send down corresponds to a failed part
1581 * of the drive.
1582 */
1583int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1584 struct btrfs_bio *bbio, u64 *raid_map,
1585 u64 stripe_len, int mirror_num)
1586{
1587 struct btrfs_raid_bio *rbio;
1588 int ret;
1589
1590 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1591 if (IS_ERR(rbio)) {
1592 return PTR_ERR(rbio);
1593 }
1594
1595 rbio->read_rebuild = 1;
1596 bio_list_add(&rbio->bio_list, bio);
1597 rbio->bio_list_bytes = bio->bi_size;
1598
1599 rbio->faila = find_logical_bio_stripe(rbio, bio);
1600 if (rbio->faila == -1) {
1601 BUG();
1602 kfree(rbio);
1603 return -EIO;
1604 }
1605
1606 /*
1607 * reconstruct from the q stripe if they are
1608 * asking for mirror 3
1609 */
1610 if (mirror_num == 3)
1611 rbio->failb = bbio->num_stripes - 2;
1612
1613 ret = lock_stripe_add(rbio);
1614
1615 /*
1616 * __raid56_parity_recover will end the bio with
1617 * any errors it hits. We don't want to return
1618 * its error value up the stack because our caller
1619 * will end up calling bio_endio with any nonzero
1620 * return
1621 */
1622 if (ret == 0)
1623 __raid56_parity_recover(rbio);
1624 /*
1625 * our rbio has been added to the list of
1626 * rbios that will be handled after the
1627 * current lock owner is done
1628 */
1629 return 0;
1630
1631}
1632
1633static void rmw_work(struct btrfs_work *work)
1634{
1635 struct btrfs_raid_bio *rbio;
1636
1637 rbio = container_of(work, struct btrfs_raid_bio, work);
1638 raid56_rmw_stripe(rbio);
1639}
1640
1641static void read_rebuild_work(struct btrfs_work *work)
1642{
1643 struct btrfs_raid_bio *rbio;
1644
1645 rbio = container_of(work, struct btrfs_raid_bio, work);
1646 __raid56_parity_recover(rbio);
1647}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
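
The two helpers above make the data/parity split explicit: a 4-device RAID5 chunk has 3 data stripes per row, a 6-device RAID6 chunk has 4. A trivial standalone check (hypothetical struct in place of map_lookup):

#include <stdio.h>

/* stand-in for the fields nr_data_stripes()/nr_parity_stripes() read */
struct fake_map {
        int num_stripes;
        int nr_parity;          /* 1 for RAID5, 2 for RAID6 */
};

static int nr_data_stripes(const struct fake_map *m)
{
        return m->num_stripes - m->nr_parity;
}

int main(void)
{
        struct fake_map raid5 = { .num_stripes = 4, .nr_parity = 1 };
        struct fake_map raid6 = { .num_stripes = 6, .nr_parity = 2 };

        printf("raid5: %d data stripes, raid6: %d data stripes\n",
               nr_data_stripes(&raid5), nr_data_stripes(&raid6));
        return 0;
}
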
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c9..bc35ed4238b8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -2246,6 +2247,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2246 struct btrfs_device *extent_dev; 2247 struct btrfs_device *extent_dev;
2247 int extent_mirror_num; 2248 int extent_mirror_num;
2248 2249
2250 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2251 BTRFS_BLOCK_GROUP_RAID6)) {
2252 if (num >= nr_data_stripes(map)) {
2253 return 0;
2254 }
2255 }
2256
2249 nstripes = length; 2257 nstripes = length;
2250 offset = 0; 2258 offset = 0;
2251 do_div(nstripes, map->stripe_len); 2259 do_div(nstripes, map->stripe_len);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..a065dec0e330 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -686,7 +686,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
686 struct extent_state *cached_state = NULL; 686 struct extent_state *cached_state = NULL;
687 u64 start = 0; 687 u64 start = 0;
688 u64 end; 688 u64 end;
689 struct blk_plug plug;
689 690
691 blk_start_plug(&plug);
690 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 692 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
691 mark, &cached_state)) { 693 mark, &cached_state)) {
692 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 694 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -700,6 +702,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
700 } 702 }
701 if (err) 703 if (err)
702 werr = err; 704 werr = err;
705 blk_finish_plug(&plug);
703 return werr; 706 return werr;
704} 707}
705 708
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 485a5423e3c6..c372264b85bf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -1389,6 +1392,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1389 } 1392 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1393 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391 1394
1395 if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 |
1396 BTRFS_BLOCK_GROUP_RAID6) && num_devices <= 3)) {
1397 printk(KERN_ERR "btrfs: unable to go below three devices "
1398 "on raid5 or raid6\n");
1399 ret = -EINVAL;
1400 goto out;
1401 }
1402
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1403 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1393 printk(KERN_ERR "btrfs: unable to go below four devices " 1404 printk(KERN_ERR "btrfs: unable to go below four devices "
1394 "on raid10\n"); 1405 "on raid10\n");
@@ -1403,6 +1414,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1403 goto out; 1414 goto out;
1404 } 1415 }
1405 1416
1417 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1418 root->fs_info->fs_devices->rw_devices <= 2) {
1419 printk(KERN_ERR "btrfs: unable to go below two "
1420 "devices on raid5\n");
1421 ret = -EINVAL;
1422 goto out;
1423 }
1424 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1425 root->fs_info->fs_devices->rw_devices <= 3) {
1426 printk(KERN_ERR "btrfs: unable to go below three "
1427 "devices on raid6\n");
1428 ret = -EINVAL;
1429 goto out;
1430 }
1431
1406 if (strcmp(device_path, "missing") == 0) { 1432 if (strcmp(device_path, "missing") == 0) {
1407 struct list_head *devices; 1433 struct list_head *devices;
1408 struct btrfs_device *tmp; 1434 struct btrfs_device *tmp;
@@ -2657,11 +2683,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2657 return 0; 2683 return 0;
2658 2684
2659 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2685 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2660 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2686 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2661 factor = 2; 2687 factor = num_stripes / 2;
2662 else 2688 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2663 factor = 1; 2689 factor = num_stripes - 1;
2664 factor = num_stripes / factor; 2690 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2691 factor = num_stripes - 2;
2692 } else {
2693 factor = num_stripes;
2694 }
2665 2695
2666 for (i = 0; i < num_stripes; i++) { 2696 for (i = 0; i < num_stripes; i++) {
2667 stripe = btrfs_stripe_nr(chunk, i); 2697 stripe = btrfs_stripe_nr(chunk, i);
@@ -2976,6 +3006,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2976 int mixed = 0; 3006 int mixed = 0;
2977 int ret; 3007 int ret;
2978 u64 num_devices; 3008 u64 num_devices;
3009 int cancel = 0;
2979 3010
2980 if (btrfs_fs_closing(fs_info) || 3011 if (btrfs_fs_closing(fs_info) ||
2981 atomic_read(&fs_info->balance_pause_req) || 3012 atomic_read(&fs_info->balance_pause_req) ||
@@ -3018,7 +3049,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3018 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3049 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3019 else 3050 else
3020 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3051 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3021 BTRFS_BLOCK_GROUP_RAID10); 3052 BTRFS_BLOCK_GROUP_RAID10 |
3053 BTRFS_BLOCK_GROUP_RAID5 |
3054 BTRFS_BLOCK_GROUP_RAID6);
3022 3055
3023 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3056 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3024 (!alloc_profile_is_valid(bctl->data.target, 1) || 3057 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3058,7 +3091,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3058 3091
3059 /* allow to reduce meta or sys integrity only if force set */ 3092 /* allow to reduce meta or sys integrity only if force set */
3060 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3093 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3061 BTRFS_BLOCK_GROUP_RAID10; 3094 BTRFS_BLOCK_GROUP_RAID10 |
3095 BTRFS_BLOCK_GROUP_RAID5 |
3096 BTRFS_BLOCK_GROUP_RAID6;
3097
3062 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3098 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3063 (fs_info->avail_system_alloc_bits & allowed) && 3099 (fs_info->avail_system_alloc_bits & allowed) &&
3064 !(bctl->sys.target & allowed)) || 3100 !(bctl->sys.target & allowed)) ||
@@ -3124,15 +3160,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3124 } 3160 }
3125 3161
3126 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3162 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3127 balance_need_close(fs_info)) { 3163 balance_need_close(fs_info))
3128 __cancel_balance(fs_info); 3164 cancel = 1;
3129 }
3130 3165
3131 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3166 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3132 fs_info->num_tolerated_disk_barrier_failures = 3167 fs_info->num_tolerated_disk_barrier_failures =
3133 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3168 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3134 } 3169 }
3135 3170
3171 if (cancel)
3172 __cancel_balance(fs_info);
3173
3136 wake_up(&fs_info->balance_wait_q); 3174 wake_up(&fs_info->balance_wait_q);
3137 3175
3138 return ret; 3176 return ret;
@@ -3493,13 +3531,45 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3493} 3531}
3494 3532
3495struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3533struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3534 /*
3535 * sub_stripes -- sub_stripes info for map, 2 for RAID10, 1 otherwise
3536 * dev_stripes -- stripes per dev, 2 for DUP, 1 otherwise
3537 * devs_max -- max devices per stripe, 0 for unlimited
3538 * devs_min -- min devices per stripe
3539 * devs_increment -- ndevs must be a multiple of this
3540 * ncopies -- how many copies of the data we have
3541 */
3496 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3542 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3497 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3543 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3498 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3544 { 1, 2, 1, 1, 1, 2 /* dup */ },
3499 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3545 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3500 { 1, 1, 0, 1, 1, 1 /* single */ }, 3546 { 1, 1, 0, 1, 1, 1 /* single */ },
3547 { 1, 1, 0, 2, 1, 2 /* raid5 */ },
3548 { 1, 1, 0, 3, 1, 3 /* raid6 */ },
3501}; 3549};
3502 3550
3551static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3552{
3553 /* TODO allow them to set a preferred stripe size */
3554 return 64 * 1024;
3555}
3556
3557static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3558{
3559 u64 features;
3560
3561 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3562 return;
3563
3564 features = btrfs_super_incompat_flags(info->super_copy);
3565 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3566 return;
3567
3568 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3569 btrfs_set_super_incompat_flags(info->super_copy, features);
3570 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3571}
3572
3503static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3573static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *extent_root, 3574 struct btrfs_root *extent_root,
3505 struct map_lookup **map_ret, 3575 struct map_lookup **map_ret,
@@ -3515,6 +3585,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3515 struct btrfs_device_info *devices_info = NULL; 3585 struct btrfs_device_info *devices_info = NULL;
3516 u64 total_avail; 3586 u64 total_avail;
3517 int num_stripes; /* total number of stripes to allocate */ 3587 int num_stripes; /* total number of stripes to allocate */
3588 int data_stripes; /* number of stripes that count for
3589 block group size */
3518 int sub_stripes; /* sub_stripes info for map */ 3590 int sub_stripes; /* sub_stripes info for map */
3519 int dev_stripes; /* stripes per dev */ 3591 int dev_stripes; /* stripes per dev */
3520 int devs_max; /* max devs to use */ 3592 int devs_max; /* max devs to use */
@@ -3526,6 +3598,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3526 u64 max_chunk_size; 3598 u64 max_chunk_size;
3527 u64 stripe_size; 3599 u64 stripe_size;
3528 u64 num_bytes; 3600 u64 num_bytes;
3601 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3529 int ndevs; 3602 int ndevs;
3530 int i; 3603 int i;
3531 int j; 3604 int j;
@@ -3651,16 +3724,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3651 stripe_size = devices_info[ndevs-1].max_avail; 3724 stripe_size = devices_info[ndevs-1].max_avail;
3652 num_stripes = ndevs * dev_stripes; 3725 num_stripes = ndevs * dev_stripes;
3653 3726
3727 /*
3728 * this will have to be fixed for RAID1 and RAID10 over
3729 * more drives
3730 */
3731 data_stripes = num_stripes / ncopies;
3732
3654 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3733 if (stripe_size * ndevs > max_chunk_size * ncopies) {
3655 stripe_size = max_chunk_size * ncopies; 3734 stripe_size = max_chunk_size * ncopies;
3656 do_div(stripe_size, ndevs); 3735 do_div(stripe_size, ndevs);
3657 } 3736 }
3658 3737 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3738 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3739 btrfs_super_stripesize(info->super_copy));
3740 data_stripes = num_stripes - 1;
3741 }
3742 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3743 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3744 btrfs_super_stripesize(info->super_copy));
3745 data_stripes = num_stripes - 2;
3746 }
3659 do_div(stripe_size, dev_stripes); 3747 do_div(stripe_size, dev_stripes);
3660 3748
3661 /* align to BTRFS_STRIPE_LEN */ 3749 /* align to BTRFS_STRIPE_LEN */
3662 do_div(stripe_size, BTRFS_STRIPE_LEN); 3750 do_div(stripe_size, raid_stripe_len);
3663 stripe_size *= BTRFS_STRIPE_LEN; 3751 stripe_size *= raid_stripe_len;
3664 3752
3665 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3753 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3666 if (!map) { 3754 if (!map) {
@@ -3678,14 +3766,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3678 } 3766 }
3679 } 3767 }
3680 map->sector_size = extent_root->sectorsize; 3768 map->sector_size = extent_root->sectorsize;
3681 map->stripe_len = BTRFS_STRIPE_LEN; 3769 map->stripe_len = raid_stripe_len;
3682 map->io_align = BTRFS_STRIPE_LEN; 3770 map->io_align = raid_stripe_len;
3683 map->io_width = BTRFS_STRIPE_LEN; 3771 map->io_width = raid_stripe_len;
3684 map->type = type; 3772 map->type = type;
3685 map->sub_stripes = sub_stripes; 3773 map->sub_stripes = sub_stripes;
3686 3774
3687 *map_ret = map; 3775 *map_ret = map;
3688 num_bytes = stripe_size * (num_stripes / ncopies); 3776 num_bytes = stripe_size * data_stripes;
3689 3777
3690 *stripe_size_out = stripe_size; 3778 *stripe_size_out = stripe_size;
3691 *num_bytes_out = num_bytes; 3779 *num_bytes_out = num_bytes;
@@ -3734,6 +3822,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3734 } 3822 }
3735 } 3823 }
3736 3824
3825 check_raid56_incompat_flag(extent_root->fs_info, type);
3826
3737 kfree(devices_info); 3827 kfree(devices_info);
3738 return 0; 3828 return 0;
3739 3829
@@ -4003,6 +4093,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4003 ret = map->num_stripes; 4093 ret = map->num_stripes;
4004 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4094 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4005 ret = map->sub_stripes; 4095 ret = map->sub_stripes;
4096 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4097 ret = 2;
4098 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4099 ret = 3;
4006 else 4100 else
4007 ret = 1; 4101 ret = 1;
4008 free_extent_map(em); 4102 free_extent_map(em);
@@ -4015,6 +4109,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4015 return ret; 4109 return ret;
4016} 4110}
4017 4111
4112unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4113 struct btrfs_mapping_tree *map_tree,
4114 u64 logical)
4115{
4116 struct extent_map *em;
4117 struct map_lookup *map;
4118 struct extent_map_tree *em_tree = &map_tree->map_tree;
4119 unsigned long len = root->sectorsize;
4120
4121 read_lock(&em_tree->lock);
4122 em = lookup_extent_mapping(em_tree, logical, len);
4123 read_unlock(&em_tree->lock);
4124 BUG_ON(!em);
4125
4126 BUG_ON(em->start > logical || em->start + em->len < logical);
4127 map = (struct map_lookup *)em->bdev;
4128 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4129 BTRFS_BLOCK_GROUP_RAID6)) {
4130 len = map->stripe_len * nr_data_stripes(map);
4131 }
4132 free_extent_map(em);
4133 return len;
4134}
4135
4136int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4137 u64 logical, u64 len, int mirror_num)
4138{
4139 struct extent_map *em;
4140 struct map_lookup *map;
4141 struct extent_map_tree *em_tree = &map_tree->map_tree;
4142 int ret = 0;
4143
4144 read_lock(&em_tree->lock);
4145 em = lookup_extent_mapping(em_tree, logical, len);
4146 read_unlock(&em_tree->lock);
4147 BUG_ON(!em);
4148
4149 BUG_ON(em->start > logical || em->start + em->len < logical);
4150 map = (struct map_lookup *)em->bdev;
4151 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4152 BTRFS_BLOCK_GROUP_RAID6))
4153 ret = 1;
4154 free_extent_map(em);
4155 return ret;
4156}
4157
4018static int find_live_mirror(struct btrfs_fs_info *fs_info, 4158static int find_live_mirror(struct btrfs_fs_info *fs_info,
4019 struct map_lookup *map, int first, int num, 4159 struct map_lookup *map, int first, int num,
4020 int optimal, int dev_replace_is_ongoing) 4160 int optimal, int dev_replace_is_ongoing)
@@ -4052,10 +4192,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4052 return optimal; 4192 return optimal;
4053} 4193}
4054 4194
4195static inline int parity_smaller(u64 a, u64 b)
4196{
4197 return a > b;
4198}
4199
4200/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4201static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4202{
4203 struct btrfs_bio_stripe s;
4204 int i;
4205 u64 l;
4206 int again = 1;
4207
4208 while (again) {
4209 again = 0;
4210 for (i = 0; i < bbio->num_stripes - 1; i++) {
4211 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4212 s = bbio->stripes[i];
4213 l = raid_map[i];
4214 bbio->stripes[i] = bbio->stripes[i+1];
4215 raid_map[i] = raid_map[i+1];
4216 bbio->stripes[i+1] = s;
4217 raid_map[i+1] = l;
4218 again = 1;
4219 }
4220 }
4221 }
4222}
4223
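
Because RAID5_P_STRIPE and RAID6_Q_STRIPE are (u64)-2 and (u64)-1, sorting the stripes by their raid_map value in ascending order naturally pushes P and Q behind every real logical address. A standalone sketch of the same bubble sort over a toy four-stripe set:

#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)
#define RAID6_Q_STRIPE ((uint64_t)-1)

struct toy_stripe { int devid; };

static void sort_parity(struct toy_stripe *stripes, uint64_t *raid_map, int n)
{
        int again = 1;

        while (again) {
                again = 0;
                for (int i = 0; i < n - 1; i++) {
                        if (raid_map[i] > raid_map[i + 1]) {
                                struct toy_stripe s = stripes[i];
                                uint64_t l = raid_map[i];

                                stripes[i] = stripes[i + 1];
                                raid_map[i] = raid_map[i + 1];
                                stripes[i + 1] = s;
                                raid_map[i + 1] = l;
                                again = 1;
                        }
                }
        }
}

int main(void)
{
        /* rotated layout: P sits on dev 0, Q on dev 1 for this particular row */
        uint64_t raid_map[4] = { RAID5_P_STRIPE, RAID6_Q_STRIPE, 0x100000, 0x110000 };
        struct toy_stripe stripes[4] = { {0}, {1}, {2}, {3} };

        sort_parity(stripes, raid_map, 4);
        for (int i = 0; i < 4; i++)
                printf("slot %d: dev %d\n", i, stripes[i].devid);
        /* data devs 2 and 3 come first; P (dev 0) and Q (dev 1) are last */
        return 0;
}
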
4055static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4224static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4056 u64 logical, u64 *length, 4225 u64 logical, u64 *length,
4057 struct btrfs_bio **bbio_ret, 4226 struct btrfs_bio **bbio_ret,
4058 int mirror_num) 4227 int mirror_num, u64 **raid_map_ret)
4059{ 4228{
4060 struct extent_map *em; 4229 struct extent_map *em;
4061 struct map_lookup *map; 4230 struct map_lookup *map;
@@ -4067,6 +4236,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4067 u64 stripe_nr; 4236 u64 stripe_nr;
4068 u64 stripe_nr_orig; 4237 u64 stripe_nr_orig;
4069 u64 stripe_nr_end; 4238 u64 stripe_nr_end;
4239 u64 stripe_len;
4240 u64 *raid_map = NULL;
4070 int stripe_index; 4241 int stripe_index;
4071 int i; 4242 int i;
4072 int ret = 0; 4243 int ret = 0;
@@ -4078,6 +4249,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4078 int num_alloc_stripes; 4249 int num_alloc_stripes;
4079 int patch_the_first_stripe_for_dev_replace = 0; 4250 int patch_the_first_stripe_for_dev_replace = 0;
4080 u64 physical_to_patch_in_first_stripe = 0; 4251 u64 physical_to_patch_in_first_stripe = 0;
4252 u64 raid56_full_stripe_start = (u64)-1;
4081 4253
4082 read_lock(&em_tree->lock); 4254 read_lock(&em_tree->lock);
4083 em = lookup_extent_mapping(em_tree, logical, *length); 4255 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4094,29 +4266,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4094 map = (struct map_lookup *)em->bdev; 4266 map = (struct map_lookup *)em->bdev;
4095 offset = logical - em->start; 4267 offset = logical - em->start;
4096 4268
4269 if (mirror_num > map->num_stripes)
4270 mirror_num = 0;
4271
4272 stripe_len = map->stripe_len;
4097 stripe_nr = offset; 4273 stripe_nr = offset;
4098 /* 4274 /*
4099 * stripe_nr counts the total number of stripes we have to stride 4275 * stripe_nr counts the total number of stripes we have to stride
4100 * to get to this block 4276 * to get to this block
4101 */ 4277 */
4102 do_div(stripe_nr, map->stripe_len); 4278 do_div(stripe_nr, stripe_len);
4103 4279
4104 stripe_offset = stripe_nr * map->stripe_len; 4280 stripe_offset = stripe_nr * stripe_len;
4105 BUG_ON(offset < stripe_offset); 4281 BUG_ON(offset < stripe_offset);
4106 4282
4107 /* stripe_offset is the offset of this block in its stripe*/ 4283 /* stripe_offset is the offset of this block in its stripe*/
4108 stripe_offset = offset - stripe_offset; 4284 stripe_offset = offset - stripe_offset;
4109 4285
4110 if (rw & REQ_DISCARD) 4286 /* if we're here for raid56, we need to know the stripe aligned start */
4287 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4288 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4289 raid56_full_stripe_start = offset;
4290
4291 /* allow a write of a full stripe, but make sure we don't
4292 * allow straddling of stripes
4293 */
4294 do_div(raid56_full_stripe_start, full_stripe_len);
4295 raid56_full_stripe_start *= full_stripe_len;
4296 }
4297
4298 if (rw & REQ_DISCARD) {
4299 /* we don't discard raid56 yet */
4300 if (map->type &
4301 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4302 ret = -EOPNOTSUPP;
4303 goto out;
4304 }
4111 *length = min_t(u64, em->len - offset, *length); 4305 *length = min_t(u64, em->len - offset, *length);
4112 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4306 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4113 /* we limit the length of each bio to what fits in a stripe */ 4307 u64 max_len;
4114 *length = min_t(u64, em->len - offset, 4308 /* For writes to RAID[56], allow a full stripeset across all disks.
4115 map->stripe_len - stripe_offset); 4309 For other RAID types and for RAID[56] reads, just allow a single
4310 stripe (on a single disk). */
4311 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4312 (rw & REQ_WRITE)) {
4313 max_len = stripe_len * nr_data_stripes(map) -
4314 (offset - raid56_full_stripe_start);
4315 } else {
4316 /* we limit the length of each bio to what fits in a stripe */
4317 max_len = stripe_len - stripe_offset;
4318 }
4319 *length = min_t(u64, em->len - offset, max_len);
4116 } else { 4320 } else {
4117 *length = em->len - offset; 4321 *length = em->len - offset;
4118 } 4322 }
4119 4323
4324 /* This is for when we're called from btrfs_merge_bio_hook() and all
4325 it cares about is the length */
4120 if (!bbio_ret) 4326 if (!bbio_ret)
4121 goto out; 4327 goto out;
4122 4328
@@ -4149,7 +4355,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4149 u64 physical_of_found = 0; 4355 u64 physical_of_found = 0;
4150 4356
4151 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4357 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4152 logical, &tmp_length, &tmp_bbio, 0); 4358 logical, &tmp_length, &tmp_bbio, 0, NULL);
4153 if (ret) { 4359 if (ret) {
4154 WARN_ON(tmp_bbio != NULL); 4360 WARN_ON(tmp_bbio != NULL);
4155 goto out; 4361 goto out;
@@ -4215,6 +4421,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4215 do_div(stripe_nr_end, map->stripe_len); 4421 do_div(stripe_nr_end, map->stripe_len);
4216 stripe_end_offset = stripe_nr_end * map->stripe_len - 4422 stripe_end_offset = stripe_nr_end * map->stripe_len -
4217 (offset + *length); 4423 (offset + *length);
4424
4218 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4425 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4219 if (rw & REQ_DISCARD) 4426 if (rw & REQ_DISCARD)
4220 num_stripes = min_t(u64, map->num_stripes, 4427 num_stripes = min_t(u64, map->num_stripes,
@@ -4265,6 +4472,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4265 dev_replace_is_ongoing); 4472 dev_replace_is_ongoing);
4266 mirror_num = stripe_index - old_stripe_index + 1; 4473 mirror_num = stripe_index - old_stripe_index + 1;
4267 } 4474 }
4475
4476 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4477 BTRFS_BLOCK_GROUP_RAID6)) {
4478 u64 tmp;
4479
4480 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4481 && raid_map_ret) {
4482 int i, rot;
4483
4484 /* push stripe_nr back to the start of the full stripe */
4485 stripe_nr = raid56_full_stripe_start;
4486 do_div(stripe_nr, stripe_len);
4487
4488 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4489
4490 /* RAID[56] write or recovery. Return all stripes */
4491 num_stripes = map->num_stripes;
4492 max_errors = nr_parity_stripes(map);
4493
4494 raid_map = kmalloc(sizeof(u64) * num_stripes,
4495 GFP_NOFS);
4496 if (!raid_map) {
4497 ret = -ENOMEM;
4498 goto out;
4499 }
4500
4501 /* Work out the disk rotation on this stripe-set */
4502 tmp = stripe_nr;
4503 rot = do_div(tmp, num_stripes);
4504
4505 /* Fill in the logical address of each stripe */
4506 tmp = stripe_nr * nr_data_stripes(map);
4507 for (i = 0; i < nr_data_stripes(map); i++)
4508 raid_map[(i+rot) % num_stripes] =
4509 em->start + (tmp + i) * map->stripe_len;
4510
4511 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4512 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4513 raid_map[(i+rot+1) % num_stripes] =
4514 RAID6_Q_STRIPE;
4515
4516 *length = map->stripe_len;
4517 stripe_index = 0;
4518 stripe_offset = 0;
4519 } else {
4520 /*
4521 * Mirror #0 or #1 means the original data block.
4522 * Mirror #2 is RAID5 parity block.
4523 * Mirror #3 is RAID6 Q block.
4524 */
4525 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4526 if (mirror_num > 1)
4527 stripe_index = nr_data_stripes(map) +
4528 mirror_num - 2;
4529
4530 /* We distribute the parity blocks across stripes */
4531 tmp = stripe_nr + stripe_index;
4532 stripe_index = do_div(tmp, map->num_stripes);
4533 }
4268 } else { 4534 } else {
4269 /* 4535 /*
4270 * after this do_div call, stripe_nr is the number of stripes 4536 * after this do_div call, stripe_nr is the number of stripes
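
The rotation math in the RAID5/6 branch above decides which slot holds P (and Q) for a given full stripe: rot = stripe_nr % num_stripes, data stripe i lands in slot (i + rot) % num_stripes, and P/Q take the next slots. A worked userspace sketch for a 5-device RAID6 chunk (illustrative chunk start, 64KiB stripe_len):

#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
        uint64_t chunk_start = 1ULL << 30;      /* em->start, illustrative */
        uint64_t stripe_len = 64 * 1024;
        int num_stripes = 5, nr_data = 3;       /* RAID6: 3 data + P + Q */
        uint64_t stripe_nr = 7;                 /* 8th full stripe of the chunk */
        uint64_t raid_map[5];

        int rot = (int)(stripe_nr % num_stripes);   /* disk rotation for this row */
        uint64_t first = stripe_nr * nr_data;       /* first data stripe number */

        /* fill in the logical address of each data stripe, then P and Q */
        for (int i = 0; i < nr_data; i++)
                raid_map[(i + rot) % num_stripes] =
                        chunk_start + (first + i) * stripe_len;
        raid_map[(nr_data + rot) % num_stripes] = RAID5_P_STRIPE;
        raid_map[(nr_data + rot + 1) % num_stripes] = RAID6_Q_STRIPE;

        for (int i = 0; i < num_stripes; i++) {
                if (raid_map[i] == RAID5_P_STRIPE)
                        printf("slot %d: P\n", i);
                else if (raid_map[i] == RAID6_Q_STRIPE)
                        printf("slot %d: Q\n", i);
                else
                        printf("slot %d: logical %llu\n", i,
                               (unsigned long long)raid_map[i]);
        }
        return 0;
}
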
@@ -4373,8 +4639,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4373 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4639 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4374 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4640 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4375 BTRFS_BLOCK_GROUP_RAID10 | 4641 BTRFS_BLOCK_GROUP_RAID10 |
4642 BTRFS_BLOCK_GROUP_RAID5 |
4376 BTRFS_BLOCK_GROUP_DUP)) { 4643 BTRFS_BLOCK_GROUP_DUP)) {
4377 max_errors = 1; 4644 max_errors = 1;
4645 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4646 max_errors = 2;
4378 } 4647 }
4379 } 4648 }
4380 4649
@@ -4475,6 +4744,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4475 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4744 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4476 bbio->mirror_num = map->num_stripes + 1; 4745 bbio->mirror_num = map->num_stripes + 1;
4477 } 4746 }
4747 if (raid_map) {
4748 sort_parity_stripes(bbio, raid_map);
4749 *raid_map_ret = raid_map;
4750 }
4478out: 4751out:
4479 if (dev_replace_is_ongoing) 4752 if (dev_replace_is_ongoing)
4480 btrfs_dev_replace_unlock(dev_replace); 4753 btrfs_dev_replace_unlock(dev_replace);
@@ -4487,7 +4760,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4487 struct btrfs_bio **bbio_ret, int mirror_num) 4760 struct btrfs_bio **bbio_ret, int mirror_num)
4488{ 4761{
4489 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4762 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4490 mirror_num); 4763 mirror_num, NULL);
4491} 4764}
4492 4765
4493int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4766int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4501,6 +4774,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4501 u64 bytenr; 4774 u64 bytenr;
4502 u64 length; 4775 u64 length;
4503 u64 stripe_nr; 4776 u64 stripe_nr;
4777 u64 rmap_len;
4504 int i, j, nr = 0; 4778 int i, j, nr = 0;
4505 4779
4506 read_lock(&em_tree->lock); 4780 read_lock(&em_tree->lock);
@@ -4511,10 +4785,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4511 map = (struct map_lookup *)em->bdev; 4785 map = (struct map_lookup *)em->bdev;
4512 4786
4513 length = em->len; 4787 length = em->len;
4788 rmap_len = map->stripe_len;
4789
4514 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4790 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4515 do_div(length, map->num_stripes / map->sub_stripes); 4791 do_div(length, map->num_stripes / map->sub_stripes);
4516 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4792 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4517 do_div(length, map->num_stripes); 4793 do_div(length, map->num_stripes);
4794 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4795 BTRFS_BLOCK_GROUP_RAID6)) {
4796 do_div(length, nr_data_stripes(map));
4797 rmap_len = map->stripe_len * nr_data_stripes(map);
4798 }
4518 4799
4519 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4800 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4520 BUG_ON(!buf); /* -ENOMEM */ 4801 BUG_ON(!buf); /* -ENOMEM */
@@ -4534,8 +4815,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4534 do_div(stripe_nr, map->sub_stripes); 4815 do_div(stripe_nr, map->sub_stripes);
4535 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4816 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4536 stripe_nr = stripe_nr * map->num_stripes + i; 4817 stripe_nr = stripe_nr * map->num_stripes + i;
4537 } 4818 } /* else if RAID[56], multiply by nr_data_stripes().
4538 bytenr = chunk_start + stripe_nr * map->stripe_len; 4819 * Alternatively, just use rmap_len below instead of
4820 * map->stripe_len */
4821
4822 bytenr = chunk_start + stripe_nr * rmap_len;
4539 WARN_ON(nr >= map->num_stripes); 4823 WARN_ON(nr >= map->num_stripes);
4540 for (j = 0; j < nr; j++) { 4824 for (j = 0; j < nr; j++) {
4541 if (buf[j] == bytenr) 4825 if (buf[j] == bytenr)
@@ -4549,7 +4833,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4549 4833
4550 *logical = buf; 4834 *logical = buf;
4551 *naddrs = nr; 4835 *naddrs = nr;
4552 *stripe_len = map->stripe_len; 4836 *stripe_len = rmap_len;
4553 4837
4554 free_extent_map(em); 4838 free_extent_map(em);
4555 return 0; 4839 return 0;
@@ -4623,7 +4907,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4623 bio->bi_bdev = (struct block_device *) 4907 bio->bi_bdev = (struct block_device *)
4624 (unsigned long)bbio->mirror_num; 4908 (unsigned long)bbio->mirror_num;
4625 /* only send an error to the higher layers if it is 4909 /* only send an error to the higher layers if it is
4626 * beyond the tolerance of the multi-bio 4910 * beyond the tolerance of the btrfs bio
4627 */ 4911 */
4628 if (atomic_read(&bbio->error) > bbio->max_errors) { 4912 if (atomic_read(&bbio->error) > bbio->max_errors) {
4629 err = -EIO; 4913 err = -EIO;
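The comment change above is cosmetic, but the surrounding logic is worth spelling out: btrfs_end_bio() counts failures across all constituent stripe bios of one btrfs_bio and only reports -EIO to the caller once the count exceeds bbio->max_errors (the redundancy of the profile), so a single lost mirror or parity device is absorbed silently. A simplified, non-atomic sketch of that tolerance check (the kernel does the same bookkeeping with atomic counters):

#include <errno.h>

struct bbio_counts {
        int errors;          /* failed constituent bios so far */
        int max_errors;      /* failures the profile can absorb, e.g. 2 for RAID6 */
        int stripes_pending; /* constituent bios still in flight */
};

/*
 * Called once per completed stripe bio.  Returns the status to hand to the
 * original bio when the last stripe finishes, 0 otherwise.
 */
static int stripe_bio_done(struct bbio_counts *b, int err)
{
        if (err)
                b->errors++;
        if (--b->stripes_pending)
                return 0;                /* more stripes outstanding */
        return (b->errors > b->max_errors) ? -EIO : 0;
}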
@@ -4657,13 +4941,18 @@ struct async_sched {
4657 * This will add one bio to the pending list for a device and make sure 4941 * This will add one bio to the pending list for a device and make sure
4658 * the work struct is scheduled. 4942 * the work struct is scheduled.
4659 */ 4943 */
4660static noinline void schedule_bio(struct btrfs_root *root, 4944noinline void btrfs_schedule_bio(struct btrfs_root *root,
4661 struct btrfs_device *device, 4945 struct btrfs_device *device,
4662 int rw, struct bio *bio) 4946 int rw, struct bio *bio)
4663{ 4947{
4664 int should_queue = 1; 4948 int should_queue = 1;
4665 struct btrfs_pending_bios *pending_bios; 4949 struct btrfs_pending_bios *pending_bios;
4666 4950
4951 if (device->missing || !device->bdev) {
4952 bio_endio(bio, -EIO);
4953 return;
4954 }
4955
4667 /* don't bother with additional async steps for reads, right now */ 4956 /* don't bother with additional async steps for reads, right now */
4668 if (!(rw & REQ_WRITE)) { 4957 if (!(rw & REQ_WRITE)) {
4669 bio_get(bio); 4958 bio_get(bio);
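The newly exported btrfs_schedule_bio() also grows a guard: if the target device is missing or has no backing bdev, the bio is completed immediately with -EIO instead of being queued, so a degraded RAID5/6 stripe member fails fast rather than dereferencing a NULL block device. A rough userspace sketch of that early-out pattern, with hypothetical types standing in for the kernel structures:

#include <errno.h>
#include <stdbool.h>

struct fake_device { bool missing; void *bdev; };
struct fake_bio    { int status; };

static void fake_endio(struct fake_bio *bio, int err) { bio->status = err; }

static void schedule_io(struct fake_device *dev, struct fake_bio *bio)
{
        /* fail fast instead of queuing work against a device that is gone */
        if (dev->missing || !dev->bdev) {
                fake_endio(bio, -EIO);
                return;
        }
        /* ... the real code queues the bio here and kicks the worker thread ... */
}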
@@ -4761,7 +5050,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4761#endif 5050#endif
4762 bio->bi_bdev = dev->bdev; 5051 bio->bi_bdev = dev->bdev;
4763 if (async) 5052 if (async)
4764 schedule_bio(root, dev, rw, bio); 5053 btrfs_schedule_bio(root, dev, rw, bio);
4765 else 5054 else
4766 btrfsic_submit_bio(rw, bio); 5055 btrfsic_submit_bio(rw, bio);
4767} 5056}
@@ -4820,6 +5109,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4820 u64 logical = (u64)bio->bi_sector << 9; 5109 u64 logical = (u64)bio->bi_sector << 9;
4821 u64 length = 0; 5110 u64 length = 0;
4822 u64 map_length; 5111 u64 map_length;
5112 u64 *raid_map = NULL;
4823 int ret; 5113 int ret;
4824 int dev_nr = 0; 5114 int dev_nr = 0;
4825 int total_devs = 1; 5115 int total_devs = 1;
@@ -4828,12 +5118,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4828 length = bio->bi_size; 5118 length = bio->bi_size;
4829 map_length = length; 5119 map_length = length;
4830 5120
4831 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5121 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4832 mirror_num); 5122 mirror_num, &raid_map);
4833 if (ret) 5123 if (ret) /* -ENOMEM */
4834 return ret; 5124 return ret;
4835 5125
4836 total_devs = bbio->num_stripes; 5126 total_devs = bbio->num_stripes;
5127 bbio->orig_bio = first_bio;
5128 bbio->private = first_bio->bi_private;
5129 bbio->end_io = first_bio->bi_end_io;
5130 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5131
5132 if (raid_map) {
5133 /* In this case, map_length has been set to the length of
5134 a single stripe; not the whole write */
5135 if (rw & WRITE) {
5136 return raid56_parity_write(root, bio, bbio,
5137 raid_map, map_length);
5138 } else {
5139 return raid56_parity_recover(root, bio, bbio,
5140 raid_map, map_length,
5141 mirror_num);
5142 }
5143 }
5144
4837 if (map_length < length) { 5145 if (map_length < length) {
4838 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5146 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4839 "len %llu\n", (unsigned long long)logical, 5147 "len %llu\n", (unsigned long long)logical,
@@ -4842,11 +5150,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4842 BUG(); 5150 BUG();
4843 } 5151 }
4844 5152
4845 bbio->orig_bio = first_bio;
4846 bbio->private = first_bio->bi_private;
4847 bbio->end_io = first_bio->bi_end_io;
4848 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4849
4850 while (dev_nr < total_devs) { 5153 while (dev_nr < total_devs) {
4851 dev = bbio->stripes[dev_nr].dev; 5154 dev = bbio->stripes[dev_nr].dev;
4852 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5155 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
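Finally, btrfs_map_bio() is where the new code is reached: the mapping call switches to __btrfs_map_block() so it can hand back a raid_map, the bbio bookkeeping (orig_bio, private, end_io, stripes_pending) is set up before the branch rather than after, and when a raid_map comes back the whole bio is handed to raid56_parity_write() for writes or raid56_parity_recover() for reads, with map_length describing a single stripe rather than the full request. A condensed sketch of that dispatch decision, using placeholder functions for the two RAID56 entry points:

#include <stdbool.h>

enum { RQ_READ, RQ_WRITE };

/* Placeholders for raid56_parity_write()/raid56_parity_recover(). */
static int parity_write(void *bio)   { (void)bio; return 0; }
static int parity_recover(void *bio) { (void)bio; return 0; }
static int stripe_submit(void *bio)  { (void)bio; return 0; }

static int map_and_submit(void *bio, int rw, bool have_raid_map)
{
        /* when a parity map exists, the RMW/recovery code owns the whole bio */
        if (have_raid_map)
                return (rw == RQ_WRITE) ? parity_write(bio)
                                        : parity_recover(bio);

        /* otherwise fall through to the per-stripe submission loop */
        return stripe_submit(bio);
}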
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..0c2b856ecd98 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{