Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    2
-rw-r--r--  fs/btrfs/Makefile           |    2
-rw-r--r--  fs/btrfs/ctree.h            |   35
-rw-r--r--  fs/btrfs/disk-io.c          |   62
-rw-r--r--  fs/btrfs/disk-io.h          |    7
-rw-r--r--  fs/btrfs/extent-tree.c      |   88
-rw-r--r--  fs/btrfs/extent_io.c        |   18
-rw-r--r--  fs/btrfs/free-space-cache.c |   50
-rw-r--r--  fs/btrfs/inode.c            |   18
-rw-r--r--  fs/btrfs/raid56.c           | 1647
-rw-r--r--  fs/btrfs/raid56.h           |   51
-rw-r--r--  fs/btrfs/scrub.c            |    8
-rw-r--r--  fs/btrfs/transaction.c      |    3
-rw-r--r--  fs/btrfs/volumes.c          |  385
-rw-r--r--  fs/btrfs/volumes.h          |    9
15 files changed, 2283 insertions(+), 102 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..4f5dc93fa2f8 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,8 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0ab51be6879f..0cce3aafbd62 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -502,6 +502,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +512,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
@@ -952,8 +954,10 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+#define BTRFS_NR_RAID_TYPES		7
 
 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +965,8 @@ struct btrfs_dev_replace_item {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
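
For illustration only (not part of the patch): full_stripe_len covers only the data stripes of one horizontal stripe and excludes parity, so a six-device RAID6 chunk with 64KiB per-device stripes has four data stripes and a 256KiB full stripe. A minimal userspace sketch of that arithmetic, with the device counts and stripe size as assumed example values:

	#include <stdio.h>

	/* hypothetical example values, not taken from the patch */
	#define STRIPE_LEN	(64 * 1024)	/* bytes per device per stripe */

	static unsigned long full_stripe_len(int num_devices, int nr_parity)
	{
		int nr_data = num_devices - nr_parity; /* parity devices hold no data */
		return (unsigned long)nr_data * STRIPE_LEN;
	}

	int main(void)
	{
		printf("raid5 on 4 devices: %lu\n", full_stripe_len(4, 1)); /* 196608 */
		printf("raid6 on 6 devices: %lu\n", full_stripe_len(6, 2)); /* 262144 */
		return 0;
	}
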
@@ -1225,6 +1235,20 @@ struct seq_list {
 	u64 seq;
 };
 
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct btrfs_stripe_hash *table;
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 65f03670a952..e9fa7b4d18e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ err:
 		btree_readahead_hook(root, eb, eb->start, ret);
 	}
 
-	if (ret)
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata == 1)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
-		else if (end_io_wq->metadata == 2)
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			btrfs_queue_worker(&fs_info->endio_freespace_worker,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
+		else if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
 					   &end_io_wq->work);
 		else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
  * 0 - if data
  * 1 - if normal metadta
  * 2 - if writing to the free space cache area
+ * 3 - raid parity work
  */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
+
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
 			   "endio-meta-write", fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_raid56_workers,
+			   "endio-raid56", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->rmw_workers,
+			   "rmw", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
+	fs_info->endio_raid56_workers.idle_thresh = 4;
+	fs_info->rmw_workers.idle_thresh = 2;
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->fixup_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->rmw_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ fail_bdi:
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 	return err;
 
@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 			    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
 			     == 0)))
 				num_tolerated_disk_barrier_failures = 0;
-			else if (num_tolerated_disk_barrier_failures > 1
-				 &&
-				 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-					   BTRFS_BLOCK_GROUP_RAID10)))
+			else if (num_tolerated_disk_barrier_failures > 1) {
+				if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+				    BTRFS_BLOCK_GROUP_RAID5 |
+				    BTRFS_BLOCK_GROUP_RAID10)) {
 				num_tolerated_disk_barrier_failures = 1;
+				} else if (flags &
+					   BTRFS_BLOCK_GROUP_RAID5) {
+					num_tolerated_disk_barrier_failures = 2;
+				}
+			}
 		}
 	}
 	up_read(&sinfo->groups_sem);
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
+	btrfs_free_stripe_hash_table(fs_info);
+
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
+enum {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = 16 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d133edfcd449..3345f68fc64b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		*actual_bytes = discarded_bytes;
 
 
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
 	return ret;
 }
 
@@ -3276,6 +3279,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 	u64 target;
+	u64 tmp;
 
 	/*
 	 * see if restripe for this chunk_type is in progress, if so
@@ -3292,30 +3296,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	}
 	spin_unlock(&root->fs_info->balance_lock);
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-	return extended_to_chunk(flags);
+	return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
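A rough userspace sketch of the selection logic above, with assumed flag values rather than the real BTRFS_BLOCK_GROUP_* constants: mask the candidate redundancy profiles out of the flags, then keep only the single highest-priority one, from RAID6 down to RAID0.

	#include <stdio.h>

	/* stand-in bit values; the real btrfs constants differ */
	enum {
		RAID0  = 1 << 0, RAID1 = 1 << 1, DUP = 1 << 2,
		RAID10 = 1 << 3, RAID5 = 1 << 4, RAID6 = 1 << 5,
	};

	static unsigned long long reduce_profile(unsigned long long flags)
	{
		unsigned long long tmp = flags & (DUP | RAID0 | RAID1 |
						  RAID5 | RAID6 | RAID10);
		flags &= ~tmp;

		/* keep only the "strongest" profile that was requested */
		if (tmp & RAID6)
			tmp = RAID6;
		else if (tmp & RAID5)
			tmp = RAID5;
		else if (tmp & RAID10)
			tmp = RAID10;
		else if (tmp & RAID1)
			tmp = RAID1;
		else if (tmp & RAID0)
			tmp = RAID0;

		return flags | tmp;
	}

	int main(void)
	{
		/* RAID1 and RAID5 both requested: RAID5 wins */
		printf("%llx\n", reduce_profile(RAID1 | RAID5));
		return 0;
	}
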
@@ -3333,6 +3339,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
+	u64 ret;
 
 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3348,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	return get_alloc_profile(root, flags);
+	ret = get_alloc_profile(root, flags);
+	return ret;
 }
 
 /*
@@ -3516,8 +3524,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-	    type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
 		num_dev = root->fs_info->fs_devices->rw_devices;
 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 		num_dev = 2;
@@ -3667,7 +3677,9 @@ static int can_overcommit(struct btrfs_root *root,
 
 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
-	 * space is actually useable.
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
 	 */
 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
 		       BTRFS_BLOCK_GROUP_RAID1 |
@@ -5455,10 +5467,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+			struct btrfs_block_group_cache *cache,
+			u64 val, u64 num_bytes)
 {
-	u64 mask = ((u64)root->stripesize - 1);
-	u64 ret = (val + mask) & ~mask;
+	u64 mask;
+	u64 ret;
+	mask = ((u64)root->stripesize - 1);
+	ret = (val + mask) & ~mask;
 	return ret;
 }
 
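stripe_align() rounds an offset up to the next multiple of root->stripesize using the usual power-of-two mask trick. A standalone sketch of the same expression, assuming the stripesize is a power of two as it is for btrfs:

	#include <stdio.h>

	static unsigned long long align_up_pow2(unsigned long long val,
						unsigned long long stripesize)
	{
		unsigned long long mask = stripesize - 1; /* stripesize must be 2^n */
		return (val + mask) & ~mask;
	}

	int main(void)
	{
		/* 4KiB stripesize: 12289 rounds up to 16384, 16384 stays put */
		printf("%llu %llu\n", align_up_pow2(12289, 4096),
				      align_up_pow2(16384, 4096));
		return 0;
	}
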
@@ -5519,9 +5535,12 @@ int __get_raid_index(u64 flags)
 		index = 2;
 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
 		index = 3;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		index = 5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		index = 6;
 	else
-		index = 4;
-
+		index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */
 	return index;
 }
 
@@ -5665,6 +5684,8 @@ search:
 		if (!block_group_bits(block_group, data)) {
 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
 				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
 				BTRFS_BLOCK_GROUP_RAID10;
 
 			/*
@@ -5835,7 +5856,8 @@ unclustered_alloc:
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, offset);
+		search_start = stripe_align(root, used_block_group,
+					    offset, num_bytes);
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
@@ -7203,6 +7225,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		root->fs_info->fs_devices->missing_devices;
 
 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
 	if (num_devices == 1) {
@@ -7754,7 +7777,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		btrfs_release_path(path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
-
+		cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       found_key.objectid);
 		btrfs_init_free_space_ctl(cache);
 
 		/*
@@ -7808,6 +7833,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		if (!(get_alloc_profile(root, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
@@ -7883,6 +7910,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
 	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       chunk_offset);
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 86ecca48c604..3b9fb478b0d1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 	if (ret)
 		err = ret;
 
-	if (did_repair) {
-		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-					rec->start + rec->len - 1,
-					EXTENT_DAMAGED, GFP_NOFS);
-		if (ret && !err)
-			err = ret;
-	}
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;
 
 	kfree(rec);
 	return err;
@@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;
 
 	BUG_ON(!mirror_num);
 
+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
 	bio = bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
@@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)
 					      failrec->failed_mirror);
 			did_repair = !ret;
 		}
+		ret = 0;
 	}
 
 out:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..62020b7f7036 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1463,10 +1463,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 }
 
 static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	u64 ctl_off;
+	u64 tmp;
+	u64 align_off;
 	int ret;
 
 	if (!ctl->free_space_offset.rb_node)
@@ -1481,15 +1485,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
 		if (entry->bytes < *bytes)
 			continue;
 
+		/* make sure the space returned is big enough
+		 * to match our requested alignment
+		 */
+		if (*bytes >= align) {
+			ctl_off = entry->offset - ctl->start;
+			tmp = ctl_off + align - 1;;
+			do_div(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off)
+			continue;
+
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, offset, bytes);
-			if (!ret)
+			ret = search_bitmap(ctl, entry, &tmp, bytes);
+			if (!ret) {
+				*offset = tmp;
 				return entry;
+			}
 			continue;
 		}
 
-		*offset = entry->offset;
-		*bytes = entry->bytes;
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
 		return entry;
 	}
 
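The alignment in find_free_space() has to handle a full stripe length that is not necessarily a power of two, and it is measured relative to ctl->start rather than to absolute offset 0. A userspace sketch of the same round-up, with made-up example numbers:

	#include <stdio.h>

	/*
	 * round 'offset' up to the next 'align' boundary, where boundaries
	 * are measured from 'start' (mirrors the ctl_off/do_div math above)
	 */
	static unsigned long long align_free_space(unsigned long long start,
						   unsigned long long offset,
						   unsigned long long align)
	{
		unsigned long long off = offset - start;
		unsigned long long tmp = (off + align - 1) / align;

		return tmp * align + start;
	}

	int main(void)
	{
		/* 192KiB full stripe (3 data disks x 64KiB), chunk starts at 1MiB */
		unsigned long long aligned = align_free_space(1048576, 1310720, 196608);

		printf("%llu\n", aligned);	/* 1441792 = 1MiB + 2 * 192KiB */
		return 0;
	}
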
@@ -2091,9 +2114,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;
 
 	spin_lock(&ctl->tree_lock);
-	entry = find_free_space(ctl, &offset, &bytes_search);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len);
 	if (!entry)
 		goto out;
 
@@ -2103,9 +2129,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
+
 		unlink_free_space(ctl, entry);
-		entry->offset += bytes;
-		entry->bytes -= bytes;
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
 		if (!entry->bytes)
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		else
@@ -2115,6 +2147,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 out:
 	spin_unlock(&ctl->tree_lock);
 
+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
 	return ret;
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1b98c4ce3c6f..6f4e41dca970 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/blkdev.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -6386,19 +6387,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
 		return -EIO;
 	}
-
 	if (map_length >= orig_bio->bi_size) {
 		bio = orig_bio;
 		goto submit;
 	}
 
-	async_submit = 1;
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
 	if (!bio)
 		return -ENOMEM;
@@ -6440,7 +6446,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(root->fs_info, READ,
+			ret = btrfs_map_block(root->fs_info, rw,
 					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
@@ -6583,15 +6589,17 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
 			    offset, nr_segs))
 		return 0;
 
-	return __blockdev_direct_IO(rw, iocb, inode,
+	ret = __blockdev_direct_IO(rw, iocb, inode,
 		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
 		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
 		   btrfs_submit_direct, 0);
+	return ret;
 }
 
 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..d02510f34936
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,1647 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50struct btrfs_raid_bio {
51 struct btrfs_fs_info *fs_info;
52 struct btrfs_bio *bbio;
53
54 /*
55 * logical block numbers for the start of each stripe
56 * The last one or two are p/q. These are sorted,
57 * so raid_map[0] is the start of our full stripe
58 */
59 u64 *raid_map;
60
61 /* while we're doing rmw on a stripe
62 * we put it into a hash table so we can
63 * lock the stripe and merge more rbios
64 * into it.
65 */
66 struct list_head hash_list;
67
68 /*
69 * for scheduling work in the helper threads
70 */
71 struct btrfs_work work;
72
73 /*
74 * bio list and bio_list_lock are used
75 * to add more bios into the stripe
76 * in hopes of avoiding the full rmw
77 */
78 struct bio_list bio_list;
79 spinlock_t bio_list_lock;
80
81 /*
82 * also protected by the bio_list_lock, the
83 * stripe locking code uses plug_list to hand off
84 * the stripe lock to the next pending IO
85 */
86 struct list_head plug_list;
87
88 /*
89 * flags that tell us if it is safe to
90 * merge with this bio
91 */
92 unsigned long flags;
93
94 /* size of each individual stripe on disk */
95 int stripe_len;
96
97 /* number of data stripes (no p/q) */
98 int nr_data;
99
100 /*
101 * set if we're doing a parity rebuild
102 * for a read from higher up, which is handled
103 * differently from a parity rebuild as part of
104 * rmw
105 */
106 int read_rebuild;
107
108 /* first bad stripe */
109 int faila;
110
111 /* second bad stripe (for raid6 use) */
112 int failb;
113
114 /*
115 * number of pages needed to represent the full
116 * stripe
117 */
118 int nr_pages;
119
120 /*
121 * size of all the bios in the bio_list. This
122 * helps us decide if the rbio maps to a full
123 * stripe or not
124 */
125 int bio_list_bytes;
126
127 atomic_t refs;
128
129 /*
130 * these are two arrays of pointers. We allocate the
131 * rbio big enough to hold them both and setup their
132 * locations when the rbio is allocated
133 */
134
135 /* pointers to pages that we allocated for
136 * reading/writing stripes directly from the disk (including P/Q)
137 */
138 struct page **stripe_pages;
139
140 /*
141 * pointers to the pages in the bio_list. Stored
142 * here for faster lookup
143 */
144 struct page **bio_pages;
145};
146
147static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
148static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
149static void rmw_work(struct btrfs_work *work);
150static void read_rebuild_work(struct btrfs_work *work);
151static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
152static void async_read_rebuild(struct btrfs_raid_bio *rbio);
153static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
154static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
155static void __free_raid_bio(struct btrfs_raid_bio *rbio);
156static void index_rbio_pages(struct btrfs_raid_bio *rbio);
157static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
158
159/*
160 * the stripe hash table is used for locking, and to collect
161 * bios in hopes of making a full stripe
162 */
163int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
164{
165 struct btrfs_stripe_hash_table *table;
166 struct btrfs_stripe_hash_table *x;
167 struct btrfs_stripe_hash *cur;
168 struct btrfs_stripe_hash *h;
169 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
170 int i;
171
172 if (info->stripe_hash_table)
173 return 0;
174
175 table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
176 if (!table)
177 return -ENOMEM;
178
179 table->table = (void *)(table + 1);
180 h = table->table;
181
182 for (i = 0; i < num_entries; i++) {
183 cur = h + i;
184 INIT_LIST_HEAD(&cur->hash_list);
185 spin_lock_init(&cur->lock);
186 init_waitqueue_head(&cur->wait);
187 }
188
189 x = cmpxchg(&info->stripe_hash_table, NULL, table);
190 if (x)
191 kfree(x);
192 return 0;
193}
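
The hash table above is carved out of a single allocation: the bucket array lives directly behind the struct, and cmpxchg() publishes it exactly once even if two mounts race. A simplified userspace sketch of that layout trick (plain calloc and no concurrency, just to show the pointer math):

	#include <stdio.h>
	#include <stdlib.h>

	struct bucket {
		int dummy;
	};

	struct table {
		struct bucket *buckets;	/* points just past this struct */
	};

	static struct table *alloc_table(int nr)
	{
		/* one allocation holds the header and all the buckets */
		struct table *t = calloc(1, sizeof(*t) + sizeof(struct bucket) * nr);

		if (!t)
			return NULL;
		t->buckets = (struct bucket *)(t + 1);
		return t;
	}

	int main(void)
	{
		struct table *t = alloc_table(1 << 11);	/* 2048 buckets */

		printf("%p %p\n", (void *)t, (void *)t->buckets);
		free(t);
		return 0;
	}
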
194
195/*
196 * we hash on the first logical address of the stripe
197 */
198static int rbio_bucket(struct btrfs_raid_bio *rbio)
199{
200 u64 num = rbio->raid_map[0];
201
202 /*
203 * we shift down quite a bit. We're using byte
204 * addressing, and most of the lower bits are zeros.
205 * This tends to upset hash_64, and it consistently
206 * returns just one or two different values.
207 *
208 * shifting off the lower bits fixes things.
209 */
210 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
211}
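
A userspace stand-in for the bucketing above: hash_64() is replaced here with a simple golden-ratio style multiply, which is only an approximation of what the kernel helper does; the important part is shifting the logical address right by 16 first so the hash sees more than trailing zeros.

	#include <stdio.h>
	#include <stdint.h>

	#define TABLE_BITS	11

	/* rough stand-in for the kernel's hash_64(), not the exact constant */
	static unsigned int hash64(uint64_t val, unsigned int bits)
	{
		return (unsigned int)((val * 0x9e3779b97f4a7c15ULL) >> (64 - bits));
	}

	static unsigned int stripe_bucket(uint64_t first_logical)
	{
		/* stripe start addresses are large and well aligned; drop low bits */
		return hash64(first_logical >> 16, TABLE_BITS);
	}

	int main(void)
	{
		printf("%u\n", stripe_bucket(5ULL * 1024 * 1024 * 1024));
		return 0;
	}
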
212
213/*
214 * merging means we take the bio_list from the victim and
215 * splice it into the destination. The victim should
216 * be discarded afterwards.
217 *
218 * must be called with dest->rbio_list_lock held
219 */
220static void merge_rbio(struct btrfs_raid_bio *dest,
221 struct btrfs_raid_bio *victim)
222{
223 bio_list_merge(&dest->bio_list, &victim->bio_list);
224 dest->bio_list_bytes += victim->bio_list_bytes;
225 bio_list_init(&victim->bio_list);
226}
227
228/*
229 * free the hash table used by unmount
230 */
231void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
232{
233 if (!info->stripe_hash_table)
234 return;
235 kfree(info->stripe_hash_table);
236 info->stripe_hash_table = NULL;
237}
238
239/*
240 * helper function to run the xor_blocks api. It is only
241 * able to do MAX_XOR_BLOCKS at a time, so we need to
242 * loop through.
243 */
244static void run_xor(void **pages, int src_cnt, ssize_t len)
245{
246 int src_off = 0;
247 int xor_src_cnt = 0;
248 void *dest = pages[src_cnt];
249
250 while(src_cnt > 0) {
251 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
252 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
253
254 src_cnt -= xor_src_cnt;
255 src_off += xor_src_cnt;
256 }
257}
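
run_xor() is what keeps the P stripe equal to the XOR of all the data stripes. A minimal userspace sketch of the property it relies on: XOR-ing the data blocks gives parity, and XOR-ing parity with the surviving blocks reconstructs a single missing block.

	#include <stdio.h>
	#include <string.h>

	#define BLKSZ	16

	static void xor_into(unsigned char *dest, const unsigned char *src)
	{
		for (int i = 0; i < BLKSZ; i++)
			dest[i] ^= src[i];
	}

	int main(void)
	{
		unsigned char d0[BLKSZ] = "hello raid5....";
		unsigned char d1[BLKSZ] = "parity example.";
		unsigned char p[BLKSZ] = {0}, rebuilt[BLKSZ] = {0};

		/* P = d0 ^ d1 */
		xor_into(p, d0);
		xor_into(p, d1);

		/* lose d1, rebuild it from P and the surviving block d0 */
		xor_into(rebuilt, p);
		xor_into(rebuilt, d0);

		printf("%s\n", memcmp(rebuilt, d1, BLKSZ) ? "mismatch" : "recovered");
		return 0;
	}
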
258
259/*
260 * returns true if the bio list inside this rbio
261 * covers an entire stripe (no rmw required).
262 * Must be called with the bio list lock held, or
263 * at a time when you know it is impossible to add
264 * new bios into the list
265 */
266static int __rbio_is_full(struct btrfs_raid_bio *rbio)
267{
268 unsigned long size = rbio->bio_list_bytes;
269 int ret = 1;
270
271 if (size != rbio->nr_data * rbio->stripe_len)
272 ret = 0;
273
274 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
275 return ret;
276}
277
278static int rbio_is_full(struct btrfs_raid_bio *rbio)
279{
280 unsigned long flags;
281 int ret;
282
283 spin_lock_irqsave(&rbio->bio_list_lock, flags);
284 ret = __rbio_is_full(rbio);
285 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
286 return ret;
287}
288
289/*
290 * returns 1 if it is safe to merge two rbios together.
291 * The merging is safe if the two rbios correspond to
292 * the same stripe and if they are both going in the same
293 * direction (read vs write), and if neither one is
294 * locked for final IO
295 *
296 * The caller is responsible for locking such that
297 * rmw_locked is safe to test
298 */
299static int rbio_can_merge(struct btrfs_raid_bio *last,
300 struct btrfs_raid_bio *cur)
301{
302 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
303 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
304 return 0;
305
306 if (last->raid_map[0] !=
307 cur->raid_map[0])
308 return 0;
309
310 /* reads can't merge with writes */
311 if (last->read_rebuild !=
312 cur->read_rebuild) {
313 return 0;
314 }
315
316 return 1;
317}
318
319/*
320 * helper to index into the pstripe
321 */
322static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
323{
324 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
325 return rbio->stripe_pages[index];
326}
327
328/*
329 * helper to index into the qstripe, returns null
330 * if there is no qstripe
331 */
332static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
333{
334 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
335 return NULL;
336
337 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
338 PAGE_CACHE_SHIFT;
339 return rbio->stripe_pages[index];
340}
341
342/*
343 * The first stripe in the table for a logical address
344 * has the lock. rbios are added in one of three ways:
345 *
346 * 1) Nobody has the stripe locked yet. The rbio is given
347 * the lock and 0 is returned. The caller must start the IO
348 * themselves.
349 *
350 * 2) Someone has the stripe locked, but we're able to merge
351 * with the lock owner. The rbio is freed and the IO will
352 * start automatically along with the existing rbio. 1 is returned.
353 *
354 * 3) Someone has the stripe locked, but we're not able to merge.
355 * The rbio is added to the lock owner's plug list, or merged into
356 * an rbio already on the plug list. When the lock owner unlocks,
357 * the next rbio on the list is run and the IO is started automatically.
358 * 1 is returned
359 *
360 * If we return 0, the caller still owns the rbio and must continue with
361 * IO submission. If we return 1, the caller must assume the rbio has
362 * already been freed.
363 */
364static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
365{
366 int bucket = rbio_bucket(rbio);
367 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
368 struct btrfs_raid_bio *cur;
369 struct btrfs_raid_bio *pending;
370 unsigned long flags;
371 DEFINE_WAIT(wait);
372 struct btrfs_raid_bio *freeit = NULL;
373 int ret = 0;
374 int walk = 0;
375
376 spin_lock_irqsave(&h->lock, flags);
377 list_for_each_entry(cur, &h->hash_list, hash_list) {
378 walk++;
379 if (cur->raid_map[0] == rbio->raid_map[0]) {
380 spin_lock(&cur->bio_list_lock);
381
382 /* can we merge into the lock owner? */
383 if (rbio_can_merge(cur, rbio)) {
384 merge_rbio(cur, rbio);
385 spin_unlock(&cur->bio_list_lock);
386 freeit = rbio;
387 ret = 1;
388 goto out;
389 }
390
391 /*
392 * we couldn't merge with the running
393 * rbio, see if we can merge with the
394 * pending ones. We don't have to
395 * check for rmw_locked because there
396 * is no way they are inside finish_rmw
397 * right now
398 */
399 list_for_each_entry(pending, &cur->plug_list,
400 plug_list) {
401 if (rbio_can_merge(pending, rbio)) {
402 merge_rbio(pending, rbio);
403 spin_unlock(&cur->bio_list_lock);
404 freeit = rbio;
405 ret = 1;
406 goto out;
407 }
408 }
409
410 /* no merging, put us on the tail of the plug list,
411 * our rbio will be started with the currently
412 * running rbio unlocks
413 */
414 list_add_tail(&rbio->plug_list, &cur->plug_list);
415 spin_unlock(&cur->bio_list_lock);
416 ret = 1;
417 goto out;
418 }
419 }
420
421 atomic_inc(&rbio->refs);
422 list_add(&rbio->hash_list, &h->hash_list);
423out:
424 spin_unlock_irqrestore(&h->lock, flags);
425 if (freeit)
426 __free_raid_bio(freeit);
427 return ret;
428}
429
430/*
431 * called as rmw or parity rebuild is completed. If the plug list has more
432 * rbios waiting for this stripe, the next one on the list will be started
433 */
434static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
435{
436 int bucket;
437 struct btrfs_stripe_hash *h;
438 unsigned long flags;
439
440 bucket = rbio_bucket(rbio);
441 h = rbio->fs_info->stripe_hash_table->table + bucket;
442
443 spin_lock_irqsave(&h->lock, flags);
444 spin_lock(&rbio->bio_list_lock);
445
446 if (!list_empty(&rbio->hash_list)) {
447
448 list_del_init(&rbio->hash_list);
449 atomic_dec(&rbio->refs);
450
451 /*
452 * we use the plug list to hold all the rbios
453 * waiting for the chance to lock this stripe.
454 * hand the lock over to one of them.
455 */
456 if (!list_empty(&rbio->plug_list)) {
457 struct btrfs_raid_bio *next;
458 struct list_head *head = rbio->plug_list.next;
459
460 next = list_entry(head, struct btrfs_raid_bio,
461 plug_list);
462
463 list_del_init(&rbio->plug_list);
464
465 list_add(&next->hash_list, &h->hash_list);
466 atomic_inc(&next->refs);
467 spin_unlock(&rbio->bio_list_lock);
468 spin_unlock_irqrestore(&h->lock, flags);
469
470 if (next->read_rebuild)
471 async_read_rebuild(next);
472 else
473 async_rmw_stripe(next);
474
475 goto done_nolock;
476
477 } else if (waitqueue_active(&h->wait)) {
478 spin_unlock(&rbio->bio_list_lock);
479 spin_unlock_irqrestore(&h->lock, flags);
480 wake_up(&h->wait);
481 goto done_nolock;
482 }
483 }
484 spin_unlock(&rbio->bio_list_lock);
485 spin_unlock_irqrestore(&h->lock, flags);
486
487done_nolock:
488 return;
489}
490
491static void __free_raid_bio(struct btrfs_raid_bio *rbio)
492{
493 int i;
494
495 WARN_ON(atomic_read(&rbio->refs) < 0);
496 if (!atomic_dec_and_test(&rbio->refs))
497 return;
498
499 WARN_ON(!list_empty(&rbio->hash_list));
500 WARN_ON(!bio_list_empty(&rbio->bio_list));
501
502 for (i = 0; i < rbio->nr_pages; i++) {
503 if (rbio->stripe_pages[i]) {
504 __free_page(rbio->stripe_pages[i]);
505 rbio->stripe_pages[i] = NULL;
506 }
507 }
508 kfree(rbio->raid_map);
509 kfree(rbio->bbio);
510 kfree(rbio);
511}
512
513static void free_raid_bio(struct btrfs_raid_bio *rbio)
514{
515 unlock_stripe(rbio);
516 __free_raid_bio(rbio);
517}
518
519/*
520 * this frees the rbio and runs through all the bios in the
521 * bio_list and calls end_io on them
522 */
523static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
524{
525 struct bio *cur = bio_list_get(&rbio->bio_list);
526 struct bio *next;
527 free_raid_bio(rbio);
528
529 while (cur) {
530 next = cur->bi_next;
531 cur->bi_next = NULL;
532 if (uptodate)
533 set_bit(BIO_UPTODATE, &cur->bi_flags);
534 bio_endio(cur, err);
535 cur = next;
536 }
537}
538
539/*
540 * end io function used by finish_rmw. When we finally
541 * get here, we've written a full stripe
542 */
543static void raid_write_end_io(struct bio *bio, int err)
544{
545 struct btrfs_raid_bio *rbio = bio->bi_private;
546
547 if (err)
548 fail_bio_stripe(rbio, bio);
549
550 bio_put(bio);
551
552 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
553 return;
554
555 err = 0;
556
557 /* OK, we have read all the stripes we need to. */
558 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
559 err = -EIO;
560
561 rbio_orig_end_io(rbio, err, 0);
562 return;
563}
564
565/*
566 * the read/modify/write code wants to use the original bio for
567 * any pages it included, and then use the rbio for everything
568 * else. This function decides if a given index (stripe number)
569 * and page number in that stripe fall inside the original bio
570 * or the rbio.
571 *
572 * if you set bio_list_only, you'll get a NULL back for any ranges
573 * that are outside the bio_list
574 *
575 * This doesn't take any refs on anything, you get a bare page pointer
576 * and the caller must bump refs as required.
577 *
578 * You must call index_rbio_pages once before you can trust
579 * the answers from this function.
580 */
581static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
582 int index, int pagenr, int bio_list_only)
583{
584 int chunk_page;
585 struct page *p = NULL;
586
587 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
588
589 spin_lock_irq(&rbio->bio_list_lock);
590 p = rbio->bio_pages[chunk_page];
591 spin_unlock_irq(&rbio->bio_list_lock);
592
593 if (p || bio_list_only)
594 return p;
595
596 return rbio->stripe_pages[chunk_page];
597}
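
page_in_rbio() and rbio_stripe_page() both flatten (stripe number, page number within the stripe) into one index over a single page array. A small sketch of that mapping, assuming a 64KiB stripe and 4KiB pages, i.e. 16 pages per stripe:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define STRIPE_LEN	(64 * 1024)	/* assumed example stripe size */

	static int flat_index(int stripe, int pagenr)
	{
		return stripe * (STRIPE_LEN >> PAGE_SHIFT) + pagenr;
	}

	int main(void)
	{
		/* page 3 of stripe 2 lands at slot 35 in the flat array */
		printf("%d\n", flat_index(2, 3));
		return 0;
	}
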
598
599/*
600 * number of pages we need for the entire stripe across all the
601 * drives
602 */
603static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
604{
605 unsigned long nr = stripe_len * nr_stripes;
606 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
607}
608
609/*
610 * allocation and initial setup for the btrfs_raid_bio. Not
611 * this does not allocate any pages for rbio->pages.
612 */
613static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
614 struct btrfs_bio *bbio, u64 *raid_map,
615 u64 stripe_len)
616{
617 struct btrfs_raid_bio *rbio;
618 int nr_data = 0;
619 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
620 void *p;
621
622 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
623 GFP_NOFS);
624 if (!rbio) {
625 kfree(raid_map);
626 kfree(bbio);
627 return ERR_PTR(-ENOMEM);
628 }
629
630 bio_list_init(&rbio->bio_list);
631 INIT_LIST_HEAD(&rbio->plug_list);
632 spin_lock_init(&rbio->bio_list_lock);
633 INIT_LIST_HEAD(&rbio->hash_list);
634 rbio->bbio = bbio;
635 rbio->raid_map = raid_map;
636 rbio->fs_info = root->fs_info;
637 rbio->stripe_len = stripe_len;
638 rbio->nr_pages = num_pages;
639 rbio->faila = -1;
640 rbio->failb = -1;
641 atomic_set(&rbio->refs, 1);
642
643 /*
644 * the stripe_pages and bio_pages array point to the extra
645 * memory we allocated past the end of the rbio
646 */
647 p = rbio + 1;
648 rbio->stripe_pages = p;
649 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
650
651 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
652 nr_data = bbio->num_stripes - 2;
653 else
654 nr_data = bbio->num_stripes - 1;
655
656 rbio->nr_data = nr_data;
657 return rbio;
658}
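
alloc_rbio() infers the number of data stripes from the raid_map: the last entry marks the Q stripe only for raid6, so nr_data is num_stripes minus two there and minus one for raid5. A sketch with made-up marker values (the real code compares against the RAID6_Q_STRIPE constant):

	#include <stdio.h>
	#include <stdint.h>

	/* made-up markers for illustration only */
	#define P_STRIPE_MARK	((uint64_t)-2)
	#define Q_STRIPE_MARK	((uint64_t)-1)

	static int count_data_stripes(const uint64_t *raid_map, int num_stripes)
	{
		if (raid_map[num_stripes - 1] == Q_STRIPE_MARK)
			return num_stripes - 2;	/* raid6: P and Q at the end */
		return num_stripes - 1;		/* raid5: only P at the end */
	}

	int main(void)
	{
		uint64_t raid5_map[4] = { 0, 65536, 131072, P_STRIPE_MARK };
		uint64_t raid6_map[4] = { 0, 65536, P_STRIPE_MARK, Q_STRIPE_MARK };

		printf("%d %d\n", count_data_stripes(raid5_map, 4),
				  count_data_stripes(raid6_map, 4));	/* 3 2 */
		return 0;
	}
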
659
660/* allocate pages for all the stripes in the bio, including parity */
661static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
662{
663 int i;
664 struct page *page;
665
666 for (i = 0; i < rbio->nr_pages; i++) {
667 if (rbio->stripe_pages[i])
668 continue;
669 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
670 if (!page)
671 return -ENOMEM;
672 rbio->stripe_pages[i] = page;
673 ClearPageUptodate(page);
674 }
675 return 0;
676}
677
678/* allocate pages for just the p/q stripes */
679static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
680{
681 int i;
682 struct page *page;
683
684 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
685
686 for (; i < rbio->nr_pages; i++) {
687 if (rbio->stripe_pages[i])
688 continue;
689 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
690 if (!page)
691 return -ENOMEM;
692 rbio->stripe_pages[i] = page;
693 }
694 return 0;
695}
696
697/*
698 * add a single page from a specific stripe into our list of bios for IO
699 * this will try to merge into existing bios if possible, and returns
700 * zero if all went well.
701 */
702int rbio_add_io_page(struct btrfs_raid_bio *rbio,
703 struct bio_list *bio_list,
704 struct page *page,
705 int stripe_nr,
706 unsigned long page_index,
707 unsigned long bio_max_len)
708{
709 struct bio *last = bio_list->tail;
710 u64 last_end = 0;
711 int ret;
712 struct bio *bio;
713 struct btrfs_bio_stripe *stripe;
714 u64 disk_start;
715
716 stripe = &rbio->bbio->stripes[stripe_nr];
717 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
718
719 /* if the device is missing, just fail this stripe */
720 if (!stripe->dev->bdev)
721 return fail_rbio_index(rbio, stripe_nr);
722
723 /* see if we can add this page onto our existing bio */
724 if (last) {
725 last_end = (u64)last->bi_sector << 9;
726 last_end += last->bi_size;
727
728 /*
729 * we can't merge these if they are from different
730 * devices or if they are not contiguous
731 */
732 if (last_end == disk_start && stripe->dev->bdev &&
733 test_bit(BIO_UPTODATE, &last->bi_flags) &&
734 last->bi_bdev == stripe->dev->bdev) {
735 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
736 if (ret == PAGE_CACHE_SIZE)
737 return 0;
738 }
739 }
740
741 /* put a new bio on the list */
742 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
743 if (!bio)
744 return -ENOMEM;
745
746 bio->bi_size = 0;
747 bio->bi_bdev = stripe->dev->bdev;
748 bio->bi_sector = disk_start >> 9;
749 set_bit(BIO_UPTODATE, &bio->bi_flags);
750
751 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
752 bio_list_add(bio_list, bio);
753 return 0;
754}
755
756/*
757 * while we're doing the read/modify/write cycle, we could
758 * have errors in reading pages off the disk. This checks
759 * for errors and if we're not able to read the page it'll
760 * trigger parity reconstruction. The rmw will be finished
761 * after we've reconstructed the failed stripes
762 */
763static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
764{
765 if (rbio->faila >= 0 || rbio->failb >= 0) {
766 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
767 __raid56_parity_recover(rbio);
768 } else {
769 finish_rmw(rbio);
770 }
771}
772
773/*
774 * these are just the pages from the rbio array, not from anything
775 * the FS sent down to us
776 */
777static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
778{
779 int index;
780 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
781 index += page;
782 return rbio->stripe_pages[index];
783}
784
785/*
786 * helper function to walk our bio list and populate the bio_pages array with
787 * the result. This seems expensive, but it is faster than constantly
788 * searching through the bio list as we set up the IO in finish_rmw or stripe
789 * reconstruction.
790 *
791 * This must be called before you trust the answers from page_in_rbio
792 */
793static void index_rbio_pages(struct btrfs_raid_bio *rbio)
794{
795 struct bio *bio;
796 u64 start;
797 unsigned long stripe_offset;
798 unsigned long page_index;
799 struct page *p;
800 int i;
801
802 spin_lock_irq(&rbio->bio_list_lock);
803 bio_list_for_each(bio, &rbio->bio_list) {
804 start = (u64)bio->bi_sector << 9;
805 stripe_offset = start - rbio->raid_map[0];
806 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
807
808 for (i = 0; i < bio->bi_vcnt; i++) {
809 p = bio->bi_io_vec[i].bv_page;
810 rbio->bio_pages[page_index + i] = p;
811 }
812 }
813 spin_unlock_irq(&rbio->bio_list_lock);
814}
815
816/*
817 * this is called from one of two situations. We either
818 * have a full stripe from the higher layers, or we've read all
819 * the missing bits off disk.
820 *
821 * This will calculate the parity and then send down any
822 * changed blocks.
823 */
824static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
825{
826 struct btrfs_bio *bbio = rbio->bbio;
827 void *pointers[bbio->num_stripes];
828 int stripe_len = rbio->stripe_len;
829 int nr_data = rbio->nr_data;
830 int stripe;
831 int pagenr;
832 int p_stripe = -1;
833 int q_stripe = -1;
834 struct bio_list bio_list;
835 struct bio *bio;
836 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
837 int ret;
838
839 bio_list_init(&bio_list);
840
841 if (bbio->num_stripes - rbio->nr_data == 1) {
842 p_stripe = bbio->num_stripes - 1;
843 } else if (bbio->num_stripes - rbio->nr_data == 2) {
844 p_stripe = bbio->num_stripes - 2;
845 q_stripe = bbio->num_stripes - 1;
846 } else {
847 BUG();
848 }
849
850 /* at this point we either have a full stripe,
851 * or we've read the full stripe from the drive.
852 * recalculate the parity and write the new results.
853 *
854 * We're not allowed to add any new bios to the
855 * bio list here, anyone else that wants to
856 * change this stripe needs to do their own rmw.
857 */
858 spin_lock_irq(&rbio->bio_list_lock);
859 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
860 spin_unlock_irq(&rbio->bio_list_lock);
861
862 atomic_set(&rbio->bbio->error, 0);
863
864 /*
865 * now that we've set rmw_locked, run through the
866 * bio list one last time and map the page pointers
867 */
868 index_rbio_pages(rbio);
869
870 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
871 struct page *p;
872 /* first collect one page from each data stripe */
873 for (stripe = 0; stripe < nr_data; stripe++) {
874 p = page_in_rbio(rbio, stripe, pagenr, 0);
875 pointers[stripe] = kmap(p);
876 }
877
878 /* then add the parity stripe */
879 p = rbio_pstripe_page(rbio, pagenr);
880 SetPageUptodate(p);
881 pointers[stripe++] = kmap(p);
882
883 if (q_stripe != -1) {
884
885 /*
886 * raid6, add the qstripe and call the
887 * library function to fill in our p/q
888 */
889 p = rbio_qstripe_page(rbio, pagenr);
890 SetPageUptodate(p);
891 pointers[stripe++] = kmap(p);
892
893 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
894 pointers);
895 } else {
896 /* raid5 */
897 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
898 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
899 }
900
901
902 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
903 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
904 }
905
906 /*
907 * time to start writing. Make bios for everything from the
908 * higher layers (the bio_list in our rbio) and our p/q. Ignore
909 * everything else.
910 */
911 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
912 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
913 struct page *page;
914 if (stripe < rbio->nr_data) {
915 page = page_in_rbio(rbio, stripe, pagenr, 1);
916 if (!page)
917 continue;
918 } else {
919 page = rbio_stripe_page(rbio, stripe, pagenr);
920 }
921
922 ret = rbio_add_io_page(rbio, &bio_list,
923 page, stripe, pagenr, rbio->stripe_len);
924 if (ret)
925 goto cleanup;
926 }
927 }
928
929 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
930 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
931
932 while (1) {
933 bio = bio_list_pop(&bio_list);
934 if (!bio)
935 break;
936
937 bio->bi_private = rbio;
938 bio->bi_end_io = raid_write_end_io;
939 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
940 submit_bio(WRITE, bio);
941 }
942 return;
943
944cleanup:
945 rbio_orig_end_io(rbio, -EIO, 0);
946}
947
948/*
949 * helper to find the stripe number for a given bio. Used to figure out which
950 * stripe has failed. This expects the bio to correspond to a physical disk,
951 * so it looks up based on physical sector numbers.
952 */
953static int find_bio_stripe(struct btrfs_raid_bio *rbio,
954 struct bio *bio)
955{
956 u64 physical = bio->bi_sector;
957 u64 stripe_start;
958 int i;
959 struct btrfs_bio_stripe *stripe;
960
961 physical <<= 9;
962
963 for (i = 0; i < rbio->bbio->num_stripes; i++) {
964 stripe = &rbio->bbio->stripes[i];
965 stripe_start = stripe->physical;
966 if (physical >= stripe_start &&
967 physical < stripe_start + rbio->stripe_len) {
968 return i;
969 }
970 }
971 return -1;
972}
973
974/*
975 * helper to find the stripe number for a given
976 * bio (before mapping). Used to figure out which stripe has
977 * failed. This looks up based on logical block numbers.
978 */
979static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
980 struct bio *bio)
981{
982 u64 logical = bio->bi_sector;
983 u64 stripe_start;
984 int i;
985
986 logical <<= 9;
987
988 for (i = 0; i < rbio->nr_data; i++) {
989 stripe_start = rbio->raid_map[i];
990 if (logical >= stripe_start &&
991 logical < stripe_start + rbio->stripe_len) {
992 return i;
993 }
994 }
995 return -1;
996}
997
998/*
999 * returns -EIO if we had too many failures
1000 */
1001static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1002{
1003 unsigned long flags;
1004 int ret = 0;
1005
1006 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1007
1008 /* we already know this stripe is bad, move on */
1009 if (rbio->faila == failed || rbio->failb == failed)
1010 goto out;
1011
1012 if (rbio->faila == -1) {
1013 /* first failure on this rbio */
1014 rbio->faila = failed;
1015 atomic_inc(&rbio->bbio->error);
1016 } else if (rbio->failb == -1) {
1017 /* second failure on this rbio */
1018 rbio->failb = failed;
1019 atomic_inc(&rbio->bbio->error);
1020 } else {
1021 ret = -EIO;
1022 }
1023out:
1024 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1025
1026 return ret;
1027}
1028
1029/*
1030 * helper to fail a stripe based on a physical disk
1031 * bio.
1032 */
1033static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1034 struct bio *bio)
1035{
1036 int failed = find_bio_stripe(rbio, bio);
1037
1038 if (failed < 0)
1039 return -EIO;
1040
1041 return fail_rbio_index(rbio, failed);
1042}
1043
1044/*
1045 * this sets each page in the bio uptodate. It should only be used on private
1046 * rbio pages, nothing that comes in from the higher layers
1047 */
1048static void set_bio_pages_uptodate(struct bio *bio)
1049{
1050 int i;
1051 struct page *p;
1052
1053 for (i = 0; i < bio->bi_vcnt; i++) {
1054 p = bio->bi_io_vec[i].bv_page;
1055 SetPageUptodate(p);
1056 }
1057}
1058
1059/*
1060 * end io for the read phase of the rmw cycle. All the bios here are physical
1061 * stripe bios we've read from the disk so we can recalculate the parity of the
1062 * stripe.
1063 *
1064 * This will usually kick off finish_rmw once all the bios are read in, but it
1065 * may trigger parity reconstruction if we had any errors along the way
1066 */
1067static void raid_rmw_end_io(struct bio *bio, int err)
1068{
1069 struct btrfs_raid_bio *rbio = bio->bi_private;
1070
1071 if (err)
1072 fail_bio_stripe(rbio, bio);
1073 else
1074 set_bio_pages_uptodate(bio);
1075
1076 bio_put(bio);
1077
1078 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1079 return;
1080
1081 err = 0;
1082 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1083 goto cleanup;
1084
1085 /*
1086 * this will normally call finish_rmw to start our write
1087 * but if there are any failed stripes we'll reconstruct
1088 * from parity first
1089 */
1090 validate_rbio_for_rmw(rbio);
1091 return;
1092
1093cleanup:
1094
1095 rbio_orig_end_io(rbio, -EIO, 0);
1096}
1097
1098static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1099{
1100 rbio->work.flags = 0;
1101 rbio->work.func = rmw_work;
1102
1103 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1104 &rbio->work);
1105}
1106
1107static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1108{
1109 rbio->work.flags = 0;
1110 rbio->work.func = read_rebuild_work;
1111
1112 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1113 &rbio->work);
1114}
1115
1116/*
1117 * the stripe must be locked by the caller. It will
1118 * unlock after all the writes are done
1119 */
1120static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1121{
1122 int bios_to_read = 0;
1123 struct btrfs_bio *bbio = rbio->bbio;
1124 struct bio_list bio_list;
1125 int ret;
1126 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1127 int pagenr;
1128 int stripe;
1129 struct bio *bio;
1130
1131 bio_list_init(&bio_list);
1132
1133 ret = alloc_rbio_pages(rbio);
1134 if (ret)
1135 goto cleanup;
1136
1137 index_rbio_pages(rbio);
1138
1139 atomic_set(&rbio->bbio->error, 0);
1140 /*
1141 * build a list of bios to read all the missing parts of this
1142 * stripe
1143 */
1144 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1145 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1146 struct page *page;
1147 /*
1148 * we want to find all the pages missing from
1149 * the rbio and read them from the disk. If
1150 * page_in_rbio finds a page in the bio list
1151 * we don't need to read it off the stripe.
1152 */
1153 page = page_in_rbio(rbio, stripe, pagenr, 1);
1154 if (page)
1155 continue;
1156
1157 page = rbio_stripe_page(rbio, stripe, pagenr);
1158 ret = rbio_add_io_page(rbio, &bio_list, page,
1159 stripe, pagenr, rbio->stripe_len);
1160 if (ret)
1161 goto cleanup;
1162 }
1163 }
1164
1165 bios_to_read = bio_list_size(&bio_list);
1166 if (!bios_to_read) {
1167 /*
1168 * this can happen if others have merged with
1169 * us, it means there is nothing left to read.
1170 * But if there are missing devices it may not be
1171 * safe to do the full stripe write yet.
1172 */
1173 goto finish;
1174 }
1175
1176 /*
1177 * the bbio may be freed once we submit the last bio. Make sure
1178 * not to touch it after that
1179 */
1180 atomic_set(&bbio->stripes_pending, bios_to_read);
1181 while (1) {
1182 bio = bio_list_pop(&bio_list);
1183 if (!bio)
1184 break;
1185
1186 bio->bi_private = rbio;
1187 bio->bi_end_io = raid_rmw_end_io;
1188
1189 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1190 BTRFS_WQ_ENDIO_RAID56);
1191
1192 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1193 submit_bio(READ, bio);
1194 }
1195 /* the actual write will happen once the reads are done */
1196 return 0;
1197
1198cleanup:
1199 rbio_orig_end_io(rbio, -EIO, 0);
1200 return -EIO;
1201
1202finish:
1203 validate_rbio_for_rmw(rbio);
1204 return 0;
1205}
1206
1207/*
1208 * if the upper layers pass in a full stripe, we thank them by only allocating
1209 * enough pages to hold the parity, and sending it all down quickly.
1210 */
1211static int full_stripe_write(struct btrfs_raid_bio *rbio)
1212{
1213 int ret;
1214
1215 ret = alloc_rbio_parity_pages(rbio);
1216 if (ret)
1217 return ret;
1218
1219 ret = lock_stripe_add(rbio);
1220 if (ret == 0)
1221 finish_rmw(rbio);
1222 return 0;
1223}
1224
1225/*
1226 * partial stripe writes get handed over to async helpers.
1227 * We're really hoping to merge a few more writes into this
1228 * rbio before calculating new parity
1229 */
1230static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1231{
1232 int ret;
1233
1234 ret = lock_stripe_add(rbio);
1235 if (ret == 0)
1236 async_rmw_stripe(rbio);
1237 return 0;
1238}
1239
1240/*
1241 * sometimes while we were reading from the drive to
1242 * recalculate parity, enough new bios come in to create
1243 * a full stripe. So we do a check here to see if we can
1244 * go directly to finish_rmw
1245 */
1246static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1247{
1248 /* head off into rmw land if we don't have a full stripe */
1249 if (!rbio_is_full(rbio))
1250 return partial_stripe_write(rbio);
1251 return full_stripe_write(rbio);
1252}
1253
1254/*
1255 * our main entry point for writes from the rest of the FS.
1256 */
1257int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1258 struct btrfs_bio *bbio, u64 *raid_map,
1259 u64 stripe_len)
1260{
1261 struct btrfs_raid_bio *rbio;
1262
1263 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1264 if (IS_ERR(rbio)) {
1265 kfree(raid_map);
1266 kfree(bbio);
1267 return PTR_ERR(rbio);
1268 }
1269 bio_list_add(&rbio->bio_list, bio);
1270 rbio->bio_list_bytes = bio->bi_size;
1271 return __raid56_parity_write(rbio);
1272}
1273
1274/*
1275 * all parity reconstruction happens here. We've read in everything
1276 * we can find from the drives and this does the heavy lifting of
1277 * sorting the good from the bad.
1278 */
1279static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1280{
1281 int pagenr, stripe;
1282 void **pointers;
1283 int faila = -1, failb = -1;
1284 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1285 struct page *page;
1286 int err;
1287 int i;
1288
1289 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1290 GFP_NOFS);
1291 if (!pointers) {
1292 err = -ENOMEM;
1293 goto cleanup_io;
1294 }
1295
1296 faila = rbio->faila;
1297 failb = rbio->failb;
1298
1299 if (rbio->read_rebuild) {
1300 spin_lock_irq(&rbio->bio_list_lock);
1301 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1302 spin_unlock_irq(&rbio->bio_list_lock);
1303 }
1304
1305 index_rbio_pages(rbio);
1306
1307 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1308 /* setup our array of pointers with pages
1309 * from each stripe
1310 */
1311 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1312 /*
1313 * if we're rebuilding a read, we have to use
1314 * pages from the bio list
1315 */
1316 if (rbio->read_rebuild &&
1317 (stripe == faila || stripe == failb)) {
1318 page = page_in_rbio(rbio, stripe, pagenr, 0);
1319 } else {
1320 page = rbio_stripe_page(rbio, stripe, pagenr);
1321 }
1322 pointers[stripe] = kmap(page);
1323 }
1324
1325 /* all raid6 handling here */
1326 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1327 RAID6_Q_STRIPE) {
1328
1329 /*
1330 * single failure, rebuild from parity raid5
1331 * style
1332 */
1333 if (failb < 0) {
1334 if (faila == rbio->nr_data) {
1335 /*
1336 * Just the P stripe has failed, without
1337 * a bad data or Q stripe.
1338 * TODO, we should redo the xor here.
1339 */
1340 err = -EIO;
1341 goto cleanup;
1342 }
1343 /*
1344 * a single failure in raid6 is rebuilt
1345 * in the pstripe code below
1346 */
1347 goto pstripe;
1348 }
1349
1350 /* make sure our ps and qs are in order */
1351 if (faila > failb) {
1352 int tmp = failb;
1353 failb = faila;
1354 faila = tmp;
1355 }
1356
1357 /* if the q stripe is failed, do a pstripe reconstruction
1358 * from the xors.
1359 * If both the q stripe and the P stripe are failed, we're
1360 * here due to a crc mismatch and we can't give them the
1361 * data they want
1362 */
1363 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1364 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1365 err = -EIO;
1366 goto cleanup;
1367 }
1368 /*
1369 * otherwise we have one bad data stripe and
1370 * a good P stripe. raid5!
1371 */
1372 goto pstripe;
1373 }
1374
1375 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1376 raid6_datap_recov(rbio->bbio->num_stripes,
1377 PAGE_SIZE, faila, pointers);
1378 } else {
1379 raid6_2data_recov(rbio->bbio->num_stripes,
1380 PAGE_SIZE, faila, failb,
1381 pointers);
1382 }
1383 } else {
1384 void *p;
1385
1386 /* rebuild from P stripe here (raid5 or raid6) */
1387 BUG_ON(failb != -1);
1388pstripe:
1389 /* Copy parity block into failed block to start with */
1390 memcpy(pointers[faila],
1391 pointers[rbio->nr_data],
1392 PAGE_CACHE_SIZE);
1393
1394 /* rearrange the pointer array */
1395 p = pointers[faila];
1396 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1397 pointers[stripe] = pointers[stripe + 1];
1398 pointers[rbio->nr_data - 1] = p;
1399
1400 /* xor in the rest */
1401 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1402 }
1403 /* if we're doing this rebuild as part of an rmw, go through
1404 * and set all of our private rbio pages in the
1405 * failed stripes as uptodate. This way finish_rmw will
1406 * know they can be trusted. If this was a read reconstruction,
1407 * other endio functions will fiddle the uptodate bits
1408 */
1409 if (!rbio->read_rebuild) {
1410 for (i = 0; i < nr_pages; i++) {
1411 if (faila != -1) {
1412 page = rbio_stripe_page(rbio, faila, i);
1413 SetPageUptodate(page);
1414 }
1415 if (failb != -1) {
1416 page = rbio_stripe_page(rbio, failb, i);
1417 SetPageUptodate(page);
1418 }
1419 }
1420 }
1421 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1422 /*
1423 * if we're rebuilding a read, we have to use
1424 * pages from the bio list
1425 */
1426 if (rbio->read_rebuild &&
1427 (stripe == faila || stripe == failb)) {
1428 page = page_in_rbio(rbio, stripe, pagenr, 0);
1429 } else {
1430 page = rbio_stripe_page(rbio, stripe, pagenr);
1431 }
1432 kunmap(page);
1433 }
1434 }
1435
1436 err = 0;
1437cleanup:
1438 kfree(pointers);
1439
1440cleanup_io:
1441
1442 if (rbio->read_rebuild) {
1443 rbio_orig_end_io(rbio, err, err == 0);
1444 } else if (err == 0) {
1445 rbio->faila = -1;
1446 rbio->failb = -1;
1447 finish_rmw(rbio);
1448 } else {
1449 rbio_orig_end_io(rbio, err, 0);
1450 }
1451}
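/*
 * Editorial sketch, not part of raid56.c: the "pstripe:" path above
 * recovers a single failed RAID5 block by copying the parity block over
 * it and XOR-ing back in every surviving data block, since
 * P = D0 ^ D1 ^ ... ^ Dn-1 implies Dfail = P ^ (all other data blocks).
 * Names below are illustrative only.
 */
static void sketch_raid5_recover(void **pointers, int nr_data, int faila,
				 size_t len)
{
	unsigned char *dst = pointers[faila];
	const unsigned char *src;
	size_t off;
	int d;

	memcpy(dst, pointers[nr_data], len);	/* start from the P block */
	for (d = 0; d < nr_data; d++) {
		if (d == faila)
			continue;
		src = pointers[d];
		for (off = 0; off < len; off++)
			dst[off] ^= src[off];
	}
}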
1452
1453/*
1454 * This is called only for stripes we've read from disk to
1455 * reconstruct the parity.
1456 */
1457static void raid_recover_end_io(struct bio *bio, int err)
1458{
1459 struct btrfs_raid_bio *rbio = bio->bi_private;
1460
1461 /*
1462 * we only read stripe pages off the disk, set them
1463 * up to date if there were no errors
1464 */
1465 if (err)
1466 fail_bio_stripe(rbio, bio);
1467 else
1468 set_bio_pages_uptodate(bio);
1469 bio_put(bio);
1470
1471 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1472 return;
1473
1474 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1475 rbio_orig_end_io(rbio, -EIO, 0);
1476 else
1477 __raid_recover_end_io(rbio);
1478}
1479
1480/*
1481 * reads everything we need off the disk to reconstruct
1482 * the parity. endio handlers trigger final reconstruction
1483 * when the IO is done.
1484 *
1485 * This is used both for reads from the higher layers and for
1486 * parity construction required to finish a rmw cycle.
1487 */
1488static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1489{
1490 int bios_to_read = 0;
1491 struct btrfs_bio *bbio = rbio->bbio;
1492 struct bio_list bio_list;
1493 int ret;
1494 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1495 int pagenr;
1496 int stripe;
1497 struct bio *bio;
1498
1499 bio_list_init(&bio_list);
1500
1501 ret = alloc_rbio_pages(rbio);
1502 if (ret)
1503 goto cleanup;
1504
1505 atomic_set(&rbio->bbio->error, 0);
1506
1507 /*
1508 * read everything that hasn't failed.
1509 */
1510 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1511 if (rbio->faila == stripe ||
1512 rbio->failb == stripe)
1513 continue;
1514
1515 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1516 struct page *p;
1517
1518 /*
1519 * the rmw code may have already read this
1520 * page in
1521 */
1522 p = rbio_stripe_page(rbio, stripe, pagenr);
1523 if (PageUptodate(p))
1524 continue;
1525
1526 ret = rbio_add_io_page(rbio, &bio_list,
1527 rbio_stripe_page(rbio, stripe, pagenr),
1528 stripe, pagenr, rbio->stripe_len);
1529 if (ret < 0)
1530 goto cleanup;
1531 }
1532 }
1533
1534 bios_to_read = bio_list_size(&bio_list);
1535 if (!bios_to_read) {
1536 /*
1537 * we might have no bios to read just because the pages
1538 * were up to date, or we might have no bios to read because
1539 * the devices were gone.
1540 */
1541 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1542 __raid_recover_end_io(rbio);
1543 goto out;
1544 } else {
1545 goto cleanup;
1546 }
1547 }
1548
1549 /*
1550 * the bbio may be freed once we submit the last bio. Make sure
1551 * not to touch it after that
1552 */
1553 atomic_set(&bbio->stripes_pending, bios_to_read);
1554 while (1) {
1555 bio = bio_list_pop(&bio_list);
1556 if (!bio)
1557 break;
1558
1559 bio->bi_private = rbio;
1560 bio->bi_end_io = raid_recover_end_io;
1561
1562 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1563 BTRFS_WQ_ENDIO_RAID56);
1564
1565 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1566 submit_bio(READ, bio);
1567 }
1568out:
1569 return 0;
1570
1571cleanup:
1572 if (rbio->read_rebuild)
1573 rbio_orig_end_io(rbio, -EIO, 0);
1574 return -EIO;
1575}
1576
1577/*
1578 * the main entry point for reads from the higher layers. This
1579 * is really only called when the normal read path had a failure,
1580 * so we assume the bio they send down corresponds to a failed part
1581 * of the drive.
1582 */
1583int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
1584 struct btrfs_bio *bbio, u64 *raid_map,
1585 u64 stripe_len, int mirror_num)
1586{
1587 struct btrfs_raid_bio *rbio;
1588 int ret;
1589
1590 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1591 if (IS_ERR(rbio)) {
1592 return PTR_ERR(rbio);
1593 }
1594
1595 rbio->read_rebuild = 1;
1596 bio_list_add(&rbio->bio_list, bio);
1597 rbio->bio_list_bytes = bio->bi_size;
1598
1599 rbio->faila = find_logical_bio_stripe(rbio, bio);
1600 if (rbio->faila == -1) {
1601 BUG();
1602 kfree(rbio);
1603 return -EIO;
1604 }
1605
1606 /*
1607 * reconstruct from the q stripe if they are
1608 * asking for mirror 3
1609 */
1610 if (mirror_num == 3)
1611 rbio->failb = bbio->num_stripes - 2;
1612
1613 ret = lock_stripe_add(rbio);
1614
1615 /*
1616 * __raid56_parity_recover will end the bio with
1617 * any errors it hits. We don't want to return
1618 * its error value up the stack because our caller
1619 * will end up calling bio_endio with any nonzero
1620 * return
1621 */
1622 if (ret == 0)
1623 __raid56_parity_recover(rbio);
1624 /*
1625 * our rbio has been added to the list of
1626 * rbios that will be handled after the
1627 * current lock owner is done
1628 */
1629 return 0;
1630
1631}
1632
1633static void rmw_work(struct btrfs_work *work)
1634{
1635 struct btrfs_raid_bio *rbio;
1636
1637 rbio = container_of(work, struct btrfs_raid_bio, work);
1638 raid56_rmw_stripe(rbio);
1639}
1640
1641static void read_rebuild_work(struct btrfs_work *work)
1642{
1643 struct btrfs_raid_bio *rbio;
1644
1645 rbio = container_of(work, struct btrfs_raid_bio, work);
1646 __raid56_parity_recover(rbio);
1647}
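A standalone sketch (editorial, not part of the patch) of the RAID5 parity step that finish_rmw() above performs per page with memcpy() plus run_xor(); the names here are illustrative only:

#include <stddef.h>
#include <string.h>

/*
 * Illustrative only: compute the RAID5 parity block.
 * pointers[0..nr_data-1] are data blocks, pointers[nr_data] is P.
 */
static void sketch_raid5_parity(void **pointers, int nr_data, size_t len)
{
	unsigned char *p = pointers[nr_data];
	size_t off;
	int d;

	memcpy(p, pointers[0], len);
	for (d = 1; d < nr_data; d++) {
		const unsigned char *src = pointers[d];

		for (off = 0; off < len; off++)
			p[off] ^= src[off];
	}
}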
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
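A usage sketch (editorial, not part of the patch) of the markers above: in the raid_map that __btrfs_map_block() hands to the raid56 code, data stripes carry their logical start address while the trailing slots hold RAID5_P_STRIPE / RAID6_Q_STRIPE, so is_parity_stripe() separates data from parity:

/* Illustrative only; assumes the raid56.h definitions listed above. */
static int sketch_count_data_stripes(u64 *raid_map, int num_stripes)
{
	int i, nr_data = 0;

	for (i = 0; i < num_stripes; i++)
		if (!is_parity_stripe(raid_map[i]))
			nr_data++;
	return nr_data;
}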
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c9..bc35ed4238b8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -2246,6 +2247,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2246 struct btrfs_device *extent_dev; 2247 struct btrfs_device *extent_dev;
2247 int extent_mirror_num; 2248 int extent_mirror_num;
2248 2249
2250 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2251 BTRFS_BLOCK_GROUP_RAID6)) {
2252 if (num >= nr_data_stripes(map)) {
2253 return 0;
2254 }
2255 }
2256
2249 nstripes = length; 2257 nstripes = length;
2250 offset = 0; 2258 offset = 0;
2251 do_div(nstripes, map->stripe_len); 2259 do_div(nstripes, map->stripe_len);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..a065dec0e330 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -686,7 +686,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
686 struct extent_state *cached_state = NULL; 686 struct extent_state *cached_state = NULL;
687 u64 start = 0; 687 u64 start = 0;
688 u64 end; 688 u64 end;
689 struct blk_plug plug;
689 690
691 blk_start_plug(&plug);
690 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 692 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
691 mark, &cached_state)) { 693 mark, &cached_state)) {
692 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 694 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -700,6 +702,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
700 } 702 }
701 if (err) 703 if (err)
702 werr = err; 704 werr = err;
705 blk_finish_plug(&plug);
703 return werr; 706 return werr;
704} 707}
705 708
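A minimal sketch (editorial, not part of the patch) of the plugging idiom the hunk above adds around the dirty-extent writeback loop; batching bios under a plug lets the block layer merge adjacent writes, which helps RAID5/6 assemble full-stripe writes:

#include <linux/blkdev.h>

/* Illustrative only: submit a batch of bios under one plug. */
static void sketch_plugged_writeback(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	/* ... submit many small bios here; they are held and merged ... */
	blk_finish_plug(&plug);
}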
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 485a5423e3c6..c372264b85bf 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -1389,6 +1392,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1389 } 1392 }
1390 btrfs_dev_replace_unlock(&root->fs_info->dev_replace); 1393 btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1391 1394
1395 if ((all_avail & (BTRFS_BLOCK_GROUP_RAID5 |
1396 BTRFS_BLOCK_GROUP_RAID6)) && num_devices <= 3) {
1397 printk(KERN_ERR "btrfs: unable to go below three devices "
1398 "on raid5 or raid6\n");
1399 ret = -EINVAL;
1400 goto out;
1401 }
1402
1392 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1403 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1393 printk(KERN_ERR "btrfs: unable to go below four devices " 1404 printk(KERN_ERR "btrfs: unable to go below four devices "
1394 "on raid10\n"); 1405 "on raid10\n");
@@ -1403,6 +1414,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1403 goto out; 1414 goto out;
1404 } 1415 }
1405 1416
1417 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1418 root->fs_info->fs_devices->rw_devices <= 2) {
1419 printk(KERN_ERR "btrfs: unable to go below two "
1420 "devices on raid5\n");
1421 ret = -EINVAL;
1422 goto out;
1423 }
1424 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1425 root->fs_info->fs_devices->rw_devices <= 3) {
1426 printk(KERN_ERR "btrfs: unable to go below three "
1427 "devices on raid6\n");
1428 ret = -EINVAL;
1429 goto out;
1430 }
1431
1406 if (strcmp(device_path, "missing") == 0) { 1432 if (strcmp(device_path, "missing") == 0) {
1407 struct list_head *devices; 1433 struct list_head *devices;
1408 struct btrfs_device *tmp; 1434 struct btrfs_device *tmp;
@@ -2657,11 +2683,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2657 return 0; 2683 return 0;
2658 2684
2659 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2685 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2660 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2686 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2661 factor = 2; 2687 factor = num_stripes / 2;
2662 else 2688 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2663 factor = 1; 2689 factor = num_stripes - 1;
2664 factor = num_stripes / factor; 2690 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2691 factor = num_stripes - 2;
2692 } else {
2693 factor = num_stripes;
2694 }
2665 2695
2666 for (i = 0; i < num_stripes; i++) { 2696 for (i = 0; i < num_stripes; i++) {
2667 stripe = btrfs_stripe_nr(chunk, i); 2697 stripe = btrfs_stripe_nr(chunk, i);
@@ -2976,6 +3006,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
2976 int mixed = 0; 3006 int mixed = 0;
2977 int ret; 3007 int ret;
2978 u64 num_devices; 3008 u64 num_devices;
3009 int cancel = 0;
2979 3010
2980 if (btrfs_fs_closing(fs_info) || 3011 if (btrfs_fs_closing(fs_info) ||
2981 atomic_read(&fs_info->balance_pause_req) || 3012 atomic_read(&fs_info->balance_pause_req) ||
@@ -3018,7 +3049,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3018 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3049 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3019 else 3050 else
3020 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3051 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3021 BTRFS_BLOCK_GROUP_RAID10); 3052 BTRFS_BLOCK_GROUP_RAID10 |
3053 BTRFS_BLOCK_GROUP_RAID5 |
3054 BTRFS_BLOCK_GROUP_RAID6);
3022 3055
3023 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3056 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3024 (!alloc_profile_is_valid(bctl->data.target, 1) || 3057 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3058,7 +3091,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3058 3091
3059 /* allow to reduce meta or sys integrity only if force set */ 3092 /* allow to reduce meta or sys integrity only if force set */
3060 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3093 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3061 BTRFS_BLOCK_GROUP_RAID10; 3094 BTRFS_BLOCK_GROUP_RAID10 |
3095 BTRFS_BLOCK_GROUP_RAID5 |
3096 BTRFS_BLOCK_GROUP_RAID6;
3097
3062 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3098 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3063 (fs_info->avail_system_alloc_bits & allowed) && 3099 (fs_info->avail_system_alloc_bits & allowed) &&
3064 !(bctl->sys.target & allowed)) || 3100 !(bctl->sys.target & allowed)) ||
@@ -3124,15 +3160,17 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3124 } 3160 }
3125 3161
3126 if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 3162 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3127 balance_need_close(fs_info)) { 3163 balance_need_close(fs_info))
3128 __cancel_balance(fs_info); 3164 cancel = 1;
3129 }
3130 3165
3131 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { 3166 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3132 fs_info->num_tolerated_disk_barrier_failures = 3167 fs_info->num_tolerated_disk_barrier_failures =
3133 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); 3168 btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3134 } 3169 }
3135 3170
3171 if (cancel)
3172 __cancel_balance(fs_info);
3173
3136 wake_up(&fs_info->balance_wait_q); 3174 wake_up(&fs_info->balance_wait_q);
3137 3175
3138 return ret; 3176 return ret;
@@ -3493,13 +3531,45 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
3493} 3531}
3494 3532
3495struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { 3533struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3534 /*
3535 * sub_stripes info for map,
3536 * dev_stripes -- stripes per dev, 2 for DUP, 1 otherwise
3537 * devs_max -- max devices per stripe, 0 for unlimited
3538 * devs_min -- min devices per stripe
3539 * devs_increment -- ndevs must be a multiple of this
3540 * ncopies -- how many copies of the data we have
3541 */
3496 { 2, 1, 0, 4, 2, 2 /* raid10 */ }, 3542 { 2, 1, 0, 4, 2, 2 /* raid10 */ },
3497 { 1, 1, 2, 2, 2, 2 /* raid1 */ }, 3543 { 1, 1, 2, 2, 2, 2 /* raid1 */ },
3498 { 1, 2, 1, 1, 1, 2 /* dup */ }, 3544 { 1, 2, 1, 1, 1, 2 /* dup */ },
3499 { 1, 1, 0, 2, 1, 1 /* raid0 */ }, 3545 { 1, 1, 0, 2, 1, 1 /* raid0 */ },
3500 { 1, 1, 0, 1, 1, 1 /* single */ }, 3546 { 1, 1, 0, 1, 1, 1 /* single */ },
3547 { 1, 1, 0, 2, 1, 2 /* raid5 */ },
3548 { 1, 1, 0, 3, 1, 3 /* raid6 */ },
3501}; 3549};
3502 3550
3551static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3552{
3553 /* TODO allow them to set a preferred stripe size */
3554 return 64 * 1024;
3555}
3556
3557static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3558{
3559 u64 features;
3560
3561 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3562 return;
3563
3564 features = btrfs_super_incompat_flags(info->super_copy);
3565 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3566 return;
3567
3568 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3569 btrfs_set_super_incompat_flags(info->super_copy, features);
3570 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3571}
3572
3503static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3573static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *extent_root, 3574 struct btrfs_root *extent_root,
3505 struct map_lookup **map_ret, 3575 struct map_lookup **map_ret,
@@ -3515,6 +3585,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3515 struct btrfs_device_info *devices_info = NULL; 3585 struct btrfs_device_info *devices_info = NULL;
3516 u64 total_avail; 3586 u64 total_avail;
3517 int num_stripes; /* total number of stripes to allocate */ 3587 int num_stripes; /* total number of stripes to allocate */
3588 int data_stripes; /* number of stripes that count for
3589 block group size */
3518 int sub_stripes; /* sub_stripes info for map */ 3590 int sub_stripes; /* sub_stripes info for map */
3519 int dev_stripes; /* stripes per dev */ 3591 int dev_stripes; /* stripes per dev */
3520 int devs_max; /* max devs to use */ 3592 int devs_max; /* max devs to use */
@@ -3526,6 +3598,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3526 u64 max_chunk_size; 3598 u64 max_chunk_size;
3527 u64 stripe_size; 3599 u64 stripe_size;
3528 u64 num_bytes; 3600 u64 num_bytes;
3601 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3529 int ndevs; 3602 int ndevs;
3530 int i; 3603 int i;
3531 int j; 3604 int j;
@@ -3651,16 +3724,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3651 stripe_size = devices_info[ndevs-1].max_avail; 3724 stripe_size = devices_info[ndevs-1].max_avail;
3652 num_stripes = ndevs * dev_stripes; 3725 num_stripes = ndevs * dev_stripes;
3653 3726
3727 /*
3728 * this will have to be fixed for RAID1 and RAID10 over
3729 * more drives
3730 */
3731 data_stripes = num_stripes / ncopies;
3732
3654 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3733 if (stripe_size * ndevs > max_chunk_size * ncopies) {
3655 stripe_size = max_chunk_size * ncopies; 3734 stripe_size = max_chunk_size * ncopies;
3656 do_div(stripe_size, ndevs); 3735 do_div(stripe_size, ndevs);
3657 } 3736 }
3658 3737 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3738 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3739 btrfs_super_stripesize(info->super_copy));
3740 data_stripes = num_stripes - 1;
3741 }
3742 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3743 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3744 btrfs_super_stripesize(info->super_copy));
3745 data_stripes = num_stripes - 2;
3746 }
3659 do_div(stripe_size, dev_stripes); 3747 do_div(stripe_size, dev_stripes);
3660 3748
3661 /* align to BTRFS_STRIPE_LEN */ 3749 /* align to BTRFS_STRIPE_LEN */
3662 do_div(stripe_size, BTRFS_STRIPE_LEN); 3750 do_div(stripe_size, raid_stripe_len);
3663 stripe_size *= BTRFS_STRIPE_LEN; 3751 stripe_size *= raid_stripe_len;
3664 3752
3665 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3753 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3666 if (!map) { 3754 if (!map) {
@@ -3678,14 +3766,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3678 } 3766 }
3679 } 3767 }
3680 map->sector_size = extent_root->sectorsize; 3768 map->sector_size = extent_root->sectorsize;
3681 map->stripe_len = BTRFS_STRIPE_LEN; 3769 map->stripe_len = raid_stripe_len;
3682 map->io_align = BTRFS_STRIPE_LEN; 3770 map->io_align = raid_stripe_len;
3683 map->io_width = BTRFS_STRIPE_LEN; 3771 map->io_width = raid_stripe_len;
3684 map->type = type; 3772 map->type = type;
3685 map->sub_stripes = sub_stripes; 3773 map->sub_stripes = sub_stripes;
3686 3774
3687 *map_ret = map; 3775 *map_ret = map;
3688 num_bytes = stripe_size * (num_stripes / ncopies); 3776 num_bytes = stripe_size * data_stripes;
3689 3777
3690 *stripe_size_out = stripe_size; 3778 *stripe_size_out = stripe_size;
3691 *num_bytes_out = num_bytes; 3779 *num_bytes_out = num_bytes;
@@ -3734,6 +3822,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3734 } 3822 }
3735 } 3823 }
3736 3824
3825 check_raid56_incompat_flag(extent_root->fs_info, type);
3826
3737 kfree(devices_info); 3827 kfree(devices_info);
3738 return 0; 3828 return 0;
3739 3829
@@ -4003,6 +4093,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4003 ret = map->num_stripes; 4093 ret = map->num_stripes;
4004 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4094 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4005 ret = map->sub_stripes; 4095 ret = map->sub_stripes;
4096 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4097 ret = 2;
4098 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4099 ret = 3;
4006 else 4100 else
4007 ret = 1; 4101 ret = 1;
4008 free_extent_map(em); 4102 free_extent_map(em);
@@ -4015,6 +4109,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4015 return ret; 4109 return ret;
4016} 4110}
4017 4111
4112unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4113 struct btrfs_mapping_tree *map_tree,
4114 u64 logical)
4115{
4116 struct extent_map *em;
4117 struct map_lookup *map;
4118 struct extent_map_tree *em_tree = &map_tree->map_tree;
4119 unsigned long len = root->sectorsize;
4120
4121 read_lock(&em_tree->lock);
4122 em = lookup_extent_mapping(em_tree, logical, len);
4123 read_unlock(&em_tree->lock);
4124 BUG_ON(!em);
4125
4126 BUG_ON(em->start > logical || em->start + em->len < logical);
4127 map = (struct map_lookup *)em->bdev;
4128 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4129 BTRFS_BLOCK_GROUP_RAID6)) {
4130 len = map->stripe_len * nr_data_stripes(map);
4131 }
4132 free_extent_map(em);
4133 return len;
4134}
4135
4136int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4137 u64 logical, u64 len, int mirror_num)
4138{
4139 struct extent_map *em;
4140 struct map_lookup *map;
4141 struct extent_map_tree *em_tree = &map_tree->map_tree;
4142 int ret = 0;
4143
4144 read_lock(&em_tree->lock);
4145 em = lookup_extent_mapping(em_tree, logical, len);
4146 read_unlock(&em_tree->lock);
4147 BUG_ON(!em);
4148
4149 BUG_ON(em->start > logical || em->start + em->len < logical);
4150 map = (struct map_lookup *)em->bdev;
4151 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4152 BTRFS_BLOCK_GROUP_RAID6))
4153 ret = 1;
4154 free_extent_map(em);
4155 return ret;
4156}
4157
4018static int find_live_mirror(struct btrfs_fs_info *fs_info, 4158static int find_live_mirror(struct btrfs_fs_info *fs_info,
4019 struct map_lookup *map, int first, int num, 4159 struct map_lookup *map, int first, int num,
4020 int optimal, int dev_replace_is_ongoing) 4160 int optimal, int dev_replace_is_ongoing)
@@ -4052,10 +4192,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4052 return optimal; 4192 return optimal;
4053} 4193}
4054 4194
4195static inline int parity_smaller(u64 a, u64 b)
4196{
4197 return a > b;
4198}
4199
4200/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4201static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4202{
4203 struct btrfs_bio_stripe s;
4204 int i;
4205 u64 l;
4206 int again = 1;
4207
4208 while (again) {
4209 again = 0;
4210 for (i = 0; i < bbio->num_stripes - 1; i++) {
4211 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4212 s = bbio->stripes[i];
4213 l = raid_map[i];
4214 bbio->stripes[i] = bbio->stripes[i+1];
4215 raid_map[i] = raid_map[i+1];
4216 bbio->stripes[i+1] = s;
4217 raid_map[i+1] = l;
4218 again = 1;
4219 }
4220 }
4221 }
4222}
4223
4055static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4224static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4056 u64 logical, u64 *length, 4225 u64 logical, u64 *length,
4057 struct btrfs_bio **bbio_ret, 4226 struct btrfs_bio **bbio_ret,
4058 int mirror_num) 4227 int mirror_num, u64 **raid_map_ret)
4059{ 4228{
4060 struct extent_map *em; 4229 struct extent_map *em;
4061 struct map_lookup *map; 4230 struct map_lookup *map;
@@ -4067,6 +4236,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4067 u64 stripe_nr; 4236 u64 stripe_nr;
4068 u64 stripe_nr_orig; 4237 u64 stripe_nr_orig;
4069 u64 stripe_nr_end; 4238 u64 stripe_nr_end;
4239 u64 stripe_len;
4240 u64 *raid_map = NULL;
4070 int stripe_index; 4241 int stripe_index;
4071 int i; 4242 int i;
4072 int ret = 0; 4243 int ret = 0;
@@ -4078,6 +4249,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4078 int num_alloc_stripes; 4249 int num_alloc_stripes;
4079 int patch_the_first_stripe_for_dev_replace = 0; 4250 int patch_the_first_stripe_for_dev_replace = 0;
4080 u64 physical_to_patch_in_first_stripe = 0; 4251 u64 physical_to_patch_in_first_stripe = 0;
4252 u64 raid56_full_stripe_start = (u64)-1;
4081 4253
4082 read_lock(&em_tree->lock); 4254 read_lock(&em_tree->lock);
4083 em = lookup_extent_mapping(em_tree, logical, *length); 4255 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4094,29 +4266,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4094 map = (struct map_lookup *)em->bdev; 4266 map = (struct map_lookup *)em->bdev;
4095 offset = logical - em->start; 4267 offset = logical - em->start;
4096 4268
4269 if (mirror_num > map->num_stripes)
4270 mirror_num = 0;
4271
4272 stripe_len = map->stripe_len;
4097 stripe_nr = offset; 4273 stripe_nr = offset;
4098 /* 4274 /*
4099 * stripe_nr counts the total number of stripes we have to stride 4275 * stripe_nr counts the total number of stripes we have to stride
4100 * to get to this block 4276 * to get to this block
4101 */ 4277 */
4102 do_div(stripe_nr, map->stripe_len); 4278 do_div(stripe_nr, stripe_len);
4103 4279
4104 stripe_offset = stripe_nr * map->stripe_len; 4280 stripe_offset = stripe_nr * stripe_len;
4105 BUG_ON(offset < stripe_offset); 4281 BUG_ON(offset < stripe_offset);
4106 4282
4107 /* stripe_offset is the offset of this block in its stripe*/ 4283 /* stripe_offset is the offset of this block in its stripe*/
4108 stripe_offset = offset - stripe_offset; 4284 stripe_offset = offset - stripe_offset;
4109 4285
4110 if (rw & REQ_DISCARD) 4286 /* if we're here for raid56, we need to know the stripe aligned start */
4287 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4288 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4289 raid56_full_stripe_start = offset;
4290
4291 /* allow a write of a full stripe, but make sure we don't
4292 * allow straddling of stripes
4293 */
4294 do_div(raid56_full_stripe_start, full_stripe_len);
4295 raid56_full_stripe_start *= full_stripe_len;
4296 }
4297
4298 if (rw & REQ_DISCARD) {
4299 /* we don't discard raid56 yet */
4300 if (map->type &
4301 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4302 ret = -EOPNOTSUPP;
4303 goto out;
4304 }
4111 *length = min_t(u64, em->len - offset, *length); 4305 *length = min_t(u64, em->len - offset, *length);
4112 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4306 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4113 /* we limit the length of each bio to what fits in a stripe */ 4307 u64 max_len;
4114 *length = min_t(u64, em->len - offset, 4308 /* For writes to RAID[56], allow a full stripeset across all disks.
4115 map->stripe_len - stripe_offset); 4309 For other RAID types and for RAID[56] reads, just allow a single
4310 stripe (on a single disk). */
4311 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4312 (rw & REQ_WRITE)) {
4313 max_len = stripe_len * nr_data_stripes(map) -
4314 (offset - raid56_full_stripe_start);
4315 } else {
4316 /* we limit the length of each bio to what fits in a stripe */
4317 max_len = stripe_len - stripe_offset;
4318 }
4319 *length = min_t(u64, em->len - offset, max_len);
4116 } else { 4320 } else {
4117 *length = em->len - offset; 4321 *length = em->len - offset;
4118 } 4322 }
4119 4323
4324 /* This is for when we're called from btrfs_merge_bio_hook() and all
4325 it cares about is the length */
4120 if (!bbio_ret) 4326 if (!bbio_ret)
4121 goto out; 4327 goto out;
4122 4328
@@ -4149,7 +4355,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4149 u64 physical_of_found = 0; 4355 u64 physical_of_found = 0;
4150 4356
4151 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4357 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4152 logical, &tmp_length, &tmp_bbio, 0); 4358 logical, &tmp_length, &tmp_bbio, 0, NULL);
4153 if (ret) { 4359 if (ret) {
4154 WARN_ON(tmp_bbio != NULL); 4360 WARN_ON(tmp_bbio != NULL);
4155 goto out; 4361 goto out;
@@ -4215,6 +4421,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4215 do_div(stripe_nr_end, map->stripe_len); 4421 do_div(stripe_nr_end, map->stripe_len);
4216 stripe_end_offset = stripe_nr_end * map->stripe_len - 4422 stripe_end_offset = stripe_nr_end * map->stripe_len -
4217 (offset + *length); 4423 (offset + *length);
4424
4218 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4425 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4219 if (rw & REQ_DISCARD) 4426 if (rw & REQ_DISCARD)
4220 num_stripes = min_t(u64, map->num_stripes, 4427 num_stripes = min_t(u64, map->num_stripes,
@@ -4265,6 +4472,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4265 dev_replace_is_ongoing); 4472 dev_replace_is_ongoing);
4266 mirror_num = stripe_index - old_stripe_index + 1; 4473 mirror_num = stripe_index - old_stripe_index + 1;
4267 } 4474 }
4475
4476 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4477 BTRFS_BLOCK_GROUP_RAID6)) {
4478 u64 tmp;
4479
4480 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4481 && raid_map_ret) {
4482 int i, rot;
4483
4484 /* push stripe_nr back to the start of the full stripe */
4485 stripe_nr = raid56_full_stripe_start;
4486 do_div(stripe_nr, stripe_len);
4487
4488 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4489
4490 /* RAID[56] write or recovery. Return all stripes */
4491 num_stripes = map->num_stripes;
4492 max_errors = nr_parity_stripes(map);
4493
4494 raid_map = kmalloc(sizeof(u64) * num_stripes,
4495 GFP_NOFS);
4496 if (!raid_map) {
4497 ret = -ENOMEM;
4498 goto out;
4499 }
4500
4501 /* Work out the disk rotation on this stripe-set */
4502 tmp = stripe_nr;
4503 rot = do_div(tmp, num_stripes);
4504
4505 /* Fill in the logical address of each stripe */
4506 tmp = stripe_nr * nr_data_stripes(map);
4507 for (i = 0; i < nr_data_stripes(map); i++)
4508 raid_map[(i+rot) % num_stripes] =
4509 em->start + (tmp + i) * map->stripe_len;
4510
4511 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4512 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4513 raid_map[(i+rot+1) % num_stripes] =
4514 RAID6_Q_STRIPE;
4515
4516 *length = map->stripe_len;
4517 stripe_index = 0;
4518 stripe_offset = 0;
4519 } else {
4520 /*
4521 * Mirror #0 or #1 means the original data block.
4522 * Mirror #2 is RAID5 parity block.
4523 * Mirror #3 is RAID6 Q block.
4524 */
4525 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4526 if (mirror_num > 1)
4527 stripe_index = nr_data_stripes(map) +
4528 mirror_num - 2;
4529
4530 /* We distribute the parity blocks across stripes */
4531 tmp = stripe_nr + stripe_index;
4532 stripe_index = do_div(tmp, map->num_stripes);
4533 }
4268 } else { 4534 } else {
4269 /* 4535 /*
4270 * after this do_div call, stripe_nr is the number of stripes 4536 * after this do_div call, stripe_nr is the number of stripes
@@ -4373,8 +4639,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4373 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4639 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4374 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4640 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4375 BTRFS_BLOCK_GROUP_RAID10 | 4641 BTRFS_BLOCK_GROUP_RAID10 |
4642 BTRFS_BLOCK_GROUP_RAID5 |
4376 BTRFS_BLOCK_GROUP_DUP)) { 4643 BTRFS_BLOCK_GROUP_DUP)) {
4377 max_errors = 1; 4644 max_errors = 1;
4645 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4646 max_errors = 2;
4378 } 4647 }
4379 } 4648 }
4380 4649
@@ -4475,6 +4744,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4475 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4744 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4476 bbio->mirror_num = map->num_stripes + 1; 4745 bbio->mirror_num = map->num_stripes + 1;
4477 } 4746 }
4747 if (raid_map) {
4748 sort_parity_stripes(bbio, raid_map);
4749 *raid_map_ret = raid_map;
4750 }
4478out: 4751out:
4479 if (dev_replace_is_ongoing) 4752 if (dev_replace_is_ongoing)
4480 btrfs_dev_replace_unlock(dev_replace); 4753 btrfs_dev_replace_unlock(dev_replace);
@@ -4487,7 +4760,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4487 struct btrfs_bio **bbio_ret, int mirror_num) 4760 struct btrfs_bio **bbio_ret, int mirror_num)
4488{ 4761{
4489 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4762 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4490 mirror_num); 4763 mirror_num, NULL);
4491} 4764}
4492 4765
4493int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4766int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4501,6 +4774,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4501 u64 bytenr; 4774 u64 bytenr;
4502 u64 length; 4775 u64 length;
4503 u64 stripe_nr; 4776 u64 stripe_nr;
4777 u64 rmap_len;
4504 int i, j, nr = 0; 4778 int i, j, nr = 0;
4505 4779
4506 read_lock(&em_tree->lock); 4780 read_lock(&em_tree->lock);
@@ -4511,10 +4785,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4511 map = (struct map_lookup *)em->bdev; 4785 map = (struct map_lookup *)em->bdev;
4512 4786
4513 length = em->len; 4787 length = em->len;
4788 rmap_len = map->stripe_len;
4789
4514 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4790 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4515 do_div(length, map->num_stripes / map->sub_stripes); 4791 do_div(length, map->num_stripes / map->sub_stripes);
4516 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4792 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4517 do_div(length, map->num_stripes); 4793 do_div(length, map->num_stripes);
4794 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4795 BTRFS_BLOCK_GROUP_RAID6)) {
4796 do_div(length, nr_data_stripes(map));
4797 rmap_len = map->stripe_len * nr_data_stripes(map);
4798 }
4518 4799
4519 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4800 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4520 BUG_ON(!buf); /* -ENOMEM */ 4801 BUG_ON(!buf); /* -ENOMEM */
@@ -4534,8 +4815,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4534 do_div(stripe_nr, map->sub_stripes); 4815 do_div(stripe_nr, map->sub_stripes);
4535 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4816 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4536 stripe_nr = stripe_nr * map->num_stripes + i; 4817 stripe_nr = stripe_nr * map->num_stripes + i;
4537 } 4818 } /* else if RAID[56], multiply by nr_data_stripes().
4538 bytenr = chunk_start + stripe_nr * map->stripe_len; 4819 * Alternatively, just use rmap_len below instead of
4820 * map->stripe_len */
4821
4822 bytenr = chunk_start + stripe_nr * rmap_len;
4539 WARN_ON(nr >= map->num_stripes); 4823 WARN_ON(nr >= map->num_stripes);
4540 for (j = 0; j < nr; j++) { 4824 for (j = 0; j < nr; j++) {
4541 if (buf[j] == bytenr) 4825 if (buf[j] == bytenr)
@@ -4549,7 +4833,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4549 4833
4550 *logical = buf; 4834 *logical = buf;
4551 *naddrs = nr; 4835 *naddrs = nr;
4552 *stripe_len = map->stripe_len; 4836 *stripe_len = rmap_len;
4553 4837
4554 free_extent_map(em); 4838 free_extent_map(em);
4555 return 0; 4839 return 0;
@@ -4623,7 +4907,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4623 bio->bi_bdev = (struct block_device *) 4907 bio->bi_bdev = (struct block_device *)
4624 (unsigned long)bbio->mirror_num; 4908 (unsigned long)bbio->mirror_num;
4625 /* only send an error to the higher layers if it is 4909 /* only send an error to the higher layers if it is
4626 * beyond the tolerance of the multi-bio 4910 * beyond the tolerance of the btrfs bio
4627 */ 4911 */
4628 if (atomic_read(&bbio->error) > bbio->max_errors) { 4912 if (atomic_read(&bbio->error) > bbio->max_errors) {
4629 err = -EIO; 4913 err = -EIO;
@@ -4657,13 +4941,18 @@ struct async_sched {
4657 * This will add one bio to the pending list for a device and make sure 4941 * This will add one bio to the pending list for a device and make sure
4658 * the work struct is scheduled. 4942 * the work struct is scheduled.
4659 */ 4943 */
4660static noinline void schedule_bio(struct btrfs_root *root, 4944noinline void btrfs_schedule_bio(struct btrfs_root *root,
4661 struct btrfs_device *device, 4945 struct btrfs_device *device,
4662 int rw, struct bio *bio) 4946 int rw, struct bio *bio)
4663{ 4947{
4664 int should_queue = 1; 4948 int should_queue = 1;
4665 struct btrfs_pending_bios *pending_bios; 4949 struct btrfs_pending_bios *pending_bios;
4666 4950
4951 if (device->missing || !device->bdev) {
4952 bio_endio(bio, -EIO);
4953 return;
4954 }
4955
4667 /* don't bother with additional async steps for reads, right now */ 4956 /* don't bother with additional async steps for reads, right now */
4668 if (!(rw & REQ_WRITE)) { 4957 if (!(rw & REQ_WRITE)) {
4669 bio_get(bio); 4958 bio_get(bio);
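schedule_bio() is renamed and exported here, presumably so the new raid56 code can reach it, and it gains an early check: if the target device is missing or has no backing bdev, the bio is completed immediately with -EIO instead of being queued, and the caller's error accounting then decides whether the overall write can still succeed. A tiny userspace sketch of that fail-fast pattern, with invented names:

#include <stdio.h>
#include <errno.h>

struct fake_bio {
	void (*end_io)(struct fake_bio *bio, int err);
};

static void report(struct fake_bio *bio, int err)
{
	printf("bio completed, err=%d\n", err);
}

static void schedule_fake_bio(struct fake_bio *bio, int device_missing)
{
	if (device_missing) {
		/* device gone: complete with -EIO now, never queue it */
		bio->end_io(bio, -EIO);
		return;
	}
	/* ...otherwise append to the pending list and kick the worker... */
	bio->end_io(bio, 0);
}

int main(void)
{
	struct fake_bio bio = { .end_io = report };
	schedule_fake_bio(&bio, 1);	/* missing-device path */
	schedule_fake_bio(&bio, 0);	/* normal path */
	return 0;
}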
@@ -4761,7 +5050,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4761#endif 5050#endif
4762 bio->bi_bdev = dev->bdev; 5051 bio->bi_bdev = dev->bdev;
4763 if (async) 5052 if (async)
4764 schedule_bio(root, dev, rw, bio); 5053 btrfs_schedule_bio(root, dev, rw, bio);
4765 else 5054 else
4766 btrfsic_submit_bio(rw, bio); 5055 btrfsic_submit_bio(rw, bio);
4767} 5056}
@@ -4820,6 +5109,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4820 u64 logical = (u64)bio->bi_sector << 9; 5109 u64 logical = (u64)bio->bi_sector << 9;
4821 u64 length = 0; 5110 u64 length = 0;
4822 u64 map_length; 5111 u64 map_length;
5112 u64 *raid_map = NULL;
4823 int ret; 5113 int ret;
4824 int dev_nr = 0; 5114 int dev_nr = 0;
4825 int total_devs = 1; 5115 int total_devs = 1;
@@ -4828,12 +5118,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4828 length = bio->bi_size; 5118 length = bio->bi_size;
4829 map_length = length; 5119 map_length = length;
4830 5120
4831 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5121 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4832 mirror_num); 5122 mirror_num, &raid_map);
4833 if (ret) 5123 if (ret) /* -ENOMEM */
4834 return ret; 5124 return ret;
4835 5125
4836 total_devs = bbio->num_stripes; 5126 total_devs = bbio->num_stripes;
5127 bbio->orig_bio = first_bio;
5128 bbio->private = first_bio->bi_private;
5129 bbio->end_io = first_bio->bi_end_io;
5130 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5131
5132 if (raid_map) {
5133 /* In this case, map_length has been set to the length of
5134 a single stripe; not the whole write */
5135 if (rw & WRITE) {
5136 return raid56_parity_write(root, bio, bbio,
5137 raid_map, map_length);
5138 } else {
5139 return raid56_parity_recover(root, bio, bbio,
5140 raid_map, map_length,
5141 mirror_num);
5142 }
5143 }
5144
4837 if (map_length < length) { 5145 if (map_length < length) {
4838 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5146 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4839 "len %llu\n", (unsigned long long)logical, 5147 "len %llu\n", (unsigned long long)logical,
@@ -4842,11 +5150,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4842 BUG(); 5150 BUG();
4843 } 5151 }
4844 5152
4845 bbio->orig_bio = first_bio;
4846 bbio->private = first_bio->bi_private;
4847 bbio->end_io = first_bio->bi_end_io;
4848 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4849
4850 while (dev_nr < total_devs) { 5153 while (dev_nr < total_devs) {
4851 dev = bbio->stripes[dev_nr].dev; 5154 dev = bbio->stripes[dev_nr].dev;
4852 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5155 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
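With the raid_map dispatch above, full-stripe writes are diverted to raid56_parity_write() before any per-device bios are built: the RAID5/6 layer gathers the data stripes, computes the parity stripes, and only then fans the result out to the devices, while reads are handed to raid56_parity_recover() when a reconstruction is requested and mirror_num says which copy has to be rebuilt from the surviving stripes and parity. For RAID5 the P stripe is simply the bytewise XOR of the data stripes (RAID6 adds a second Q stripe computed over a Galois field). A minimal userspace sketch of the P computation, purely for illustration:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define STRIPE_LEN 16		/* tiny stripe just for the example */
#define NR_DATA    3

int main(void)
{
	uint8_t data[NR_DATA][STRIPE_LEN];
	uint8_t parity[STRIPE_LEN];
	int i, j;

	for (i = 0; i < NR_DATA; i++)
		memset(data[i], i + 1, STRIPE_LEN);	/* fake stripe contents */

	/* P = D0 ^ D1 ^ ... ^ Dn-1, byte by byte */
	memset(parity, 0, STRIPE_LEN);
	for (i = 0; i < NR_DATA; i++)
		for (j = 0; j < STRIPE_LEN; j++)
			parity[j] ^= data[i][j];

	/* XOR of the surviving stripes with P rebuilds any one missing stripe */
	printf("P[0] = 0x%02x\n", parity[0]);
	return 0;
}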
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d3c3939ac751..0c2b856ecd98 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{