author     Chris Mason <chris.mason@fusionio.com>  2013-02-20 14:06:05 -0500
committer  Chris Mason <chris.mason@fusionio.com>  2013-02-20 14:06:05 -0500
commit     e942f883bc6651d50be139477baf6fb0eed3d5bb (patch)
tree       e1d19783e9c8b42198a69c17c9719fb90f302847 /fs/btrfs
parent     b2c6b3e0611c58fbeb6b9c0892b6249f7bdfaf6b (diff)
parent     0e4e02636611dbf89a2f36320a32054f9936d6cb (diff)
Merge branch 'raid56-experimental' into for-linus-3.9

Signed-off-by: Chris Mason <chris.mason@fusionio.com>

Conflicts:
	fs/btrfs/ctree.h
	fs/btrfs/extent-tree.c
	fs/btrfs/inode.c
	fs/btrfs/volumes.c
Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig                3
-rw-r--r--  fs/btrfs/Makefile               2
-rw-r--r--  fs/btrfs/compression.c          4
-rw-r--r--  fs/btrfs/ctree.h               44
-rw-r--r--  fs/btrfs/delayed-ref.h          9
-rw-r--r--  fs/btrfs/disk-io.c             62
-rw-r--r--  fs/btrfs/disk-io.h              7
-rw-r--r--  fs/btrfs/extent-tree.c        156
-rw-r--r--  fs/btrfs/extent_io.c           40
-rw-r--r--  fs/btrfs/extent_io.h            2
-rw-r--r--  fs/btrfs/free-space-cache.c    50
-rw-r--r--  fs/btrfs/inode.c               18
-rw-r--r--  fs/btrfs/raid56.c            2080
-rw-r--r--  fs/btrfs/raid56.h              51
-rw-r--r--  fs/btrfs/scrub.c                8
-rw-r--r--  fs/btrfs/transaction.c          9
-rw-r--r--  fs/btrfs/volumes.c            380
-rw-r--r--  fs/btrfs/volumes.h              9
18 files changed, 2814 insertions, 120 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..5f583c8a36d0 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,9 @@ config BTRFS_FS
6 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
7 select LZO_COMPRESS 7 select LZO_COMPRESS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ
10 select XOR_BLOCKS
11
9 help 12 help
10 Btrfs is a new filesystem with extents, writable snapshotting, 13 Btrfs is a new filesystem with extents, writable snapshotting,
11 support for multiple devices and many more features. 14 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o 11 reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o 14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
372 page = compressed_pages[pg_index]; 372 page = compressed_pages[pg_index];
373 page->mapping = inode->i_mapping; 373 page->mapping = inode->i_mapping;
374 if (bio->bi_size) 374 if (bio->bi_size)
375 ret = io_tree->ops->merge_bio_hook(page, 0, 375 ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
376 PAGE_CACHE_SIZE, 376 PAGE_CACHE_SIZE,
377 bio, 0); 377 bio, 0);
378 else 378 else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
655 page->index = em_start >> PAGE_CACHE_SHIFT; 655 page->index = em_start >> PAGE_CACHE_SHIFT;
656 656
657 if (comp_bio->bi_size) 657 if (comp_bio->bi_size)
658 ret = tree->ops->merge_bio_hook(page, 0, 658 ret = tree->ops->merge_bio_hook(READ, page, 0,
659 PAGE_CACHE_SIZE, 659 PAGE_CACHE_SIZE,
660 comp_bio, 0); 660 comp_bio, 0);
661 else 661 else
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1679051f4d39..3dcedfe4f759 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -506,6 +506,7 @@ struct btrfs_super_block {
506#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5) 506#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
507 507
508#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6) 508#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
509#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
509 510
510#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 511#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
511#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 512#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
@@ -515,6 +516,7 @@ struct btrfs_super_block {
515 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ 516 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
516 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ 517 BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
517 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ 518 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
519 BTRFS_FEATURE_INCOMPAT_RAID56 | \
518 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) 520 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
519 521
520/* 522/*
@@ -956,6 +958,8 @@ struct btrfs_dev_replace_item {
956#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4) 958#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
957#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5) 959#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
958#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) 960#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
961#define BTRFS_BLOCK_GROUP_RAID5 (1 << 7)
962#define BTRFS_BLOCK_GROUP_RAID6 (1 << 8)
959#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE 963#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
960 964
961enum btrfs_raid_types { 965enum btrfs_raid_types {
@@ -964,6 +968,8 @@ enum btrfs_raid_types {
964 BTRFS_RAID_DUP, 968 BTRFS_RAID_DUP,
965 BTRFS_RAID_RAID0, 969 BTRFS_RAID_RAID0,
966 BTRFS_RAID_SINGLE, 970 BTRFS_RAID_SINGLE,
971 BTRFS_RAID_RAID5,
972 BTRFS_RAID_RAID6,
967 BTRFS_NR_RAID_TYPES 973 BTRFS_NR_RAID_TYPES
968}; 974};
969 975
@@ -973,6 +979,8 @@ enum btrfs_raid_types {
973 979
974#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 980#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
975 BTRFS_BLOCK_GROUP_RAID1 | \ 981 BTRFS_BLOCK_GROUP_RAID1 | \
982 BTRFS_BLOCK_GROUP_RAID5 | \
983 BTRFS_BLOCK_GROUP_RAID6 | \
976 BTRFS_BLOCK_GROUP_DUP | \ 984 BTRFS_BLOCK_GROUP_DUP | \
977 BTRFS_BLOCK_GROUP_RAID10) 985 BTRFS_BLOCK_GROUP_RAID10)
978/* 986/*
@@ -1197,6 +1205,10 @@ struct btrfs_block_group_cache {
1197 u64 flags; 1205 u64 flags;
1198 u64 sectorsize; 1206 u64 sectorsize;
1199 u64 cache_generation; 1207 u64 cache_generation;
1208
1209 /* for raid56, this is a full stripe, without parity */
1210 unsigned long full_stripe_len;
1211
1200 unsigned int ro:1; 1212 unsigned int ro:1;
1201 unsigned int dirty:1; 1213 unsigned int dirty:1;
1202 unsigned int iref:1; 1214 unsigned int iref:1;
@@ -1242,6 +1254,23 @@ enum btrfs_orphan_cleanup_state {
1242 ORPHAN_CLEANUP_DONE = 2, 1254 ORPHAN_CLEANUP_DONE = 2,
1243}; 1255};
1244 1256
1257/* used by the raid56 code to lock stripes for read/modify/write */
1258struct btrfs_stripe_hash {
1259 struct list_head hash_list;
1260 wait_queue_head_t wait;
1261 spinlock_t lock;
1262};
1263
1264/* used by the raid56 code to lock stripes for read/modify/write */
1265struct btrfs_stripe_hash_table {
1266 struct list_head stripe_cache;
1267 spinlock_t cache_lock;
1268 int cache_size;
1269 struct btrfs_stripe_hash table[];
1270};
1271
1272#define BTRFS_STRIPE_HASH_TABLE_BITS 11
1273
1245/* fs_info */ 1274/* fs_info */
1246struct reloc_control; 1275struct reloc_control;
1247struct btrfs_device; 1276struct btrfs_device;
@@ -1341,6 +1370,13 @@ struct btrfs_fs_info {
1341 struct mutex cleaner_mutex; 1370 struct mutex cleaner_mutex;
1342 struct mutex chunk_mutex; 1371 struct mutex chunk_mutex;
1343 struct mutex volume_mutex; 1372 struct mutex volume_mutex;
1373
1374 /* this is used during read/modify/write to make sure
1375 * no two ios are trying to mod the same stripe at the same
1376 * time
1377 */
1378 struct btrfs_stripe_hash_table *stripe_hash_table;
1379
1344 /* 1380 /*
1345 * this protects the ordered operations list only while we are 1381 * this protects the ordered operations list only while we are
1346 * processing all of the entries on it. This way we make 1382 * processing all of the entries on it. This way we make
@@ -1423,6 +1459,8 @@ struct btrfs_fs_info {
1423 struct btrfs_workers flush_workers; 1459 struct btrfs_workers flush_workers;
1424 struct btrfs_workers endio_workers; 1460 struct btrfs_workers endio_workers;
1425 struct btrfs_workers endio_meta_workers; 1461 struct btrfs_workers endio_meta_workers;
1462 struct btrfs_workers endio_raid56_workers;
1463 struct btrfs_workers rmw_workers;
1426 struct btrfs_workers endio_meta_write_workers; 1464 struct btrfs_workers endio_meta_write_workers;
1427 struct btrfs_workers endio_write_workers; 1465 struct btrfs_workers endio_write_workers;
1428 struct btrfs_workers endio_freespace_worker; 1466 struct btrfs_workers endio_freespace_worker;
@@ -3490,9 +3528,9 @@ int btrfs_writepages(struct address_space *mapping,
3490 struct writeback_control *wbc); 3528 struct writeback_control *wbc);
3491int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 3529int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
3492 struct btrfs_root *new_root, u64 new_dirid); 3530 struct btrfs_root *new_root, u64 new_dirid);
3493int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 3531int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
3494 size_t size, struct bio *bio, unsigned long bio_flags); 3532 size_t size, struct bio *bio,
3495 3533 unsigned long bio_flags);
3496int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 3534int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
3497int btrfs_readpage(struct file *file, struct page *page); 3535int btrfs_readpage(struct file *file, struct page *page);
3498void btrfs_evict_inode(struct inode *inode); 3536void btrfs_evict_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 7939149f8f27..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
132 unsigned long num_heads_ready; 132 unsigned long num_heads_ready;
133 133
134 /* 134 /*
135 * bumped when someone is making progress on the delayed
136 * refs, so that other procs know they are just adding to
137 * contention intead of helping
138 */
139 atomic_t procs_running_refs;
140 atomic_t ref_seq;
141 wait_queue_head_t wait;
142
143 /*
135 * set when the tree is flushing before a transaction commit, 144 * set when the tree is flushing before a transaction commit,
136 * used by the throttling code to decide if new updates need 145 * used by the throttling code to decide if new updates need
137 * to be run right away 146 * to be run right away
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 779b401cd952..eb7c14308521 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -640,8 +641,15 @@ err:
640 btree_readahead_hook(root, eb, eb->start, ret); 641 btree_readahead_hook(root, eb, eb->start, ret);
641 } 642 }
642 643
643 if (ret) 644 if (ret) {
645 /*
646 * our io error hook is going to dec the io pages
647 * again, we have to make sure it has something
648 * to decrement
649 */
650 atomic_inc(&eb->io_pages);
644 clear_extent_buffer_uptodate(eb); 651 clear_extent_buffer_uptodate(eb);
652 }
645 free_extent_buffer(eb); 653 free_extent_buffer(eb);
646out: 654out:
647 return ret; 655 return ret;
@@ -655,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
655 eb = (struct extent_buffer *)page->private; 663 eb = (struct extent_buffer *)page->private;
656 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 664 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
657 eb->read_mirror = failed_mirror; 665 eb->read_mirror = failed_mirror;
666 atomic_dec(&eb->io_pages);
658 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 667 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
659 btree_readahead_hook(root, eb, eb->start, -EIO); 668 btree_readahead_hook(root, eb, eb->start, -EIO);
660 return -EIO; /* we fixed nothing */ 669 return -EIO; /* we fixed nothing */
@@ -671,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
671 end_io_wq->work.flags = 0; 680 end_io_wq->work.flags = 0;
672 681
673 if (bio->bi_rw & REQ_WRITE) { 682 if (bio->bi_rw & REQ_WRITE) {
674 if (end_io_wq->metadata == 1) 683 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
675 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 684 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
676 &end_io_wq->work); 685 &end_io_wq->work);
677 else if (end_io_wq->metadata == 2) 686 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
678 btrfs_queue_worker(&fs_info->endio_freespace_worker, 687 btrfs_queue_worker(&fs_info->endio_freespace_worker,
679 &end_io_wq->work); 688 &end_io_wq->work);
689 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
690 btrfs_queue_worker(&fs_info->endio_raid56_workers,
691 &end_io_wq->work);
680 else 692 else
681 btrfs_queue_worker(&fs_info->endio_write_workers, 693 btrfs_queue_worker(&fs_info->endio_write_workers,
682 &end_io_wq->work); 694 &end_io_wq->work);
683 } else { 695 } else {
684 if (end_io_wq->metadata) 696 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
697 btrfs_queue_worker(&fs_info->endio_raid56_workers,
698 &end_io_wq->work);
699 else if (end_io_wq->metadata)
685 btrfs_queue_worker(&fs_info->endio_meta_workers, 700 btrfs_queue_worker(&fs_info->endio_meta_workers,
686 &end_io_wq->work); 701 &end_io_wq->work);
687 else 702 else
@@ -696,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
696 * 0 - if data 711 * 0 - if data
697 * 1 - if normal metadta 712 * 1 - if normal metadta
698 * 2 - if writing to the free space cache area 713 * 2 - if writing to the free space cache area
714 * 3 - raid parity work
699 */ 715 */
700int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 716int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
701 int metadata) 717 int metadata)
@@ -2179,6 +2195,12 @@ int open_ctree(struct super_block *sb,
2179 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2195 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2180 init_waitqueue_head(&fs_info->async_submit_wait); 2196 init_waitqueue_head(&fs_info->async_submit_wait);
2181 2197
2198 ret = btrfs_alloc_stripe_hash_table(fs_info);
2199 if (ret) {
2200 err = -ENOMEM;
2201 goto fail_alloc;
2202 }
2203
2182 __setup_root(4096, 4096, 4096, 4096, tree_root, 2204 __setup_root(4096, 4096, 4096, 4096, tree_root,
2183 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2205 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2184 2206
@@ -2349,6 +2371,12 @@ int open_ctree(struct super_block *sb,
2349 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2371 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2350 "endio-meta-write", fs_info->thread_pool_size, 2372 "endio-meta-write", fs_info->thread_pool_size,
2351 &fs_info->generic_worker); 2373 &fs_info->generic_worker);
2374 btrfs_init_workers(&fs_info->endio_raid56_workers,
2375 "endio-raid56", fs_info->thread_pool_size,
2376 &fs_info->generic_worker);
2377 btrfs_init_workers(&fs_info->rmw_workers,
2378 "rmw", fs_info->thread_pool_size,
2379 &fs_info->generic_worker);
2352 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2380 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2353 fs_info->thread_pool_size, 2381 fs_info->thread_pool_size,
2354 &fs_info->generic_worker); 2382 &fs_info->generic_worker);
@@ -2367,6 +2395,8 @@ int open_ctree(struct super_block *sb,
2367 */ 2395 */
2368 fs_info->endio_workers.idle_thresh = 4; 2396 fs_info->endio_workers.idle_thresh = 4;
2369 fs_info->endio_meta_workers.idle_thresh = 4; 2397 fs_info->endio_meta_workers.idle_thresh = 4;
2398 fs_info->endio_raid56_workers.idle_thresh = 4;
2399 fs_info->rmw_workers.idle_thresh = 2;
2370 2400
2371 fs_info->endio_write_workers.idle_thresh = 2; 2401 fs_info->endio_write_workers.idle_thresh = 2;
2372 fs_info->endio_meta_write_workers.idle_thresh = 2; 2402 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2383,6 +2413,8 @@ int open_ctree(struct super_block *sb,
2383 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2413 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2384 ret |= btrfs_start_workers(&fs_info->endio_workers); 2414 ret |= btrfs_start_workers(&fs_info->endio_workers);
2385 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2415 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2416 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2417 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2386 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2418 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2387 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2419 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2388 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2420 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2726,6 +2758,8 @@ fail_sb_buffer:
2726 btrfs_stop_workers(&fs_info->workers); 2758 btrfs_stop_workers(&fs_info->workers);
2727 btrfs_stop_workers(&fs_info->endio_workers); 2759 btrfs_stop_workers(&fs_info->endio_workers);
2728 btrfs_stop_workers(&fs_info->endio_meta_workers); 2760 btrfs_stop_workers(&fs_info->endio_meta_workers);
2761 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2762 btrfs_stop_workers(&fs_info->rmw_workers);
2729 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2763 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2730 btrfs_stop_workers(&fs_info->endio_write_workers); 2764 btrfs_stop_workers(&fs_info->endio_write_workers);
2731 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2765 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2747,6 +2781,7 @@ fail_bdi:
2747fail_srcu: 2781fail_srcu:
2748 cleanup_srcu_struct(&fs_info->subvol_srcu); 2782 cleanup_srcu_struct(&fs_info->subvol_srcu);
2749fail: 2783fail:
2784 btrfs_free_stripe_hash_table(fs_info);
2750 btrfs_close_devices(fs_info->fs_devices); 2785 btrfs_close_devices(fs_info->fs_devices);
2751 return err; 2786 return err;
2752 2787
@@ -3094,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3094 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3129 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3095 == 0))) 3130 == 0)))
3096 num_tolerated_disk_barrier_failures = 0; 3131 num_tolerated_disk_barrier_failures = 0;
3097 else if (num_tolerated_disk_barrier_failures > 1 3132 else if (num_tolerated_disk_barrier_failures > 1) {
3098 && 3133 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3099 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3134 BTRFS_BLOCK_GROUP_RAID5 |
3100 BTRFS_BLOCK_GROUP_RAID10))) 3135 BTRFS_BLOCK_GROUP_RAID10)) {
3101 num_tolerated_disk_barrier_failures = 1; 3136 num_tolerated_disk_barrier_failures = 1;
3137 } else if (flags &
3138 BTRFS_BLOCK_GROUP_RAID5) {
3139 num_tolerated_disk_barrier_failures = 2;
3140 }
3141 }
3102 } 3142 }
3103 } 3143 }
3104 up_read(&sinfo->groups_sem); 3144 up_read(&sinfo->groups_sem);
@@ -3402,6 +3442,8 @@ int close_ctree(struct btrfs_root *root)
3402 btrfs_stop_workers(&fs_info->workers); 3442 btrfs_stop_workers(&fs_info->workers);
3403 btrfs_stop_workers(&fs_info->endio_workers); 3443 btrfs_stop_workers(&fs_info->endio_workers);
3404 btrfs_stop_workers(&fs_info->endio_meta_workers); 3444 btrfs_stop_workers(&fs_info->endio_meta_workers);
3445 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3446 btrfs_stop_workers(&fs_info->rmw_workers);
3405 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3447 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3406 btrfs_stop_workers(&fs_info->endio_write_workers); 3448 btrfs_stop_workers(&fs_info->endio_write_workers);
3407 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3449 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3424,6 +3466,8 @@ int close_ctree(struct btrfs_root *root)
3424 bdi_destroy(&fs_info->bdi); 3466 bdi_destroy(&fs_info->bdi);
3425 cleanup_srcu_struct(&fs_info->subvol_srcu); 3467 cleanup_srcu_struct(&fs_info->subvol_srcu);
3426 3468
3469 btrfs_free_stripe_hash_table(fs_info);
3470
3427 return 0; 3471 return 0;
3428} 3472}
3429 3473
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
25#define BTRFS_SUPER_MIRROR_MAX 3 25#define BTRFS_SUPER_MIRROR_MAX 3
26#define BTRFS_SUPER_MIRROR_SHIFT 12 26#define BTRFS_SUPER_MIRROR_SHIFT 12
27 27
28enum {
29 BTRFS_WQ_ENDIO_DATA = 0,
30 BTRFS_WQ_ENDIO_METADATA = 1,
31 BTRFS_WQ_ENDIO_FREE_SPACE = 2,
32 BTRFS_WQ_ENDIO_RAID56 = 3,
33};
34
28static inline u64 btrfs_sb_offset(int mirror) 35static inline u64 btrfs_sb_offset(int mirror)
29{ 36{
30 u64 start = 16 * 1024; 37 u64 start = 16 * 1024;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5cd44e239595..b3ecca447ddf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2440,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2440 return ret; 2443 return ret;
2441} 2444}
2442 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2443/* 2456/*
2444 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2445 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2474,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2474 2487
2475 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2476 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2477again: 2528again:
2478 loops = 0; 2529 loops = 0;
2479 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
@@ -2482,10 +2533,6 @@ again:
2482 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2483#endif 2534#endif
2484 2535
2485 if (count == 0) {
2486 count = delayed_refs->num_entries * 2;
2487 run_most = 1;
2488 }
2489 while (1) { 2536 while (1) {
2490 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2491 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2508,9 +2555,12 @@ again:
2508 btrfs_release_ref_cluster(&cluster); 2555 btrfs_release_ref_cluster(&cluster);
2509 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2510 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2511 return ret; 2559 return ret;
2512 } 2560 }
2513 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2514 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2515 2565
2516 if (count == 0) 2566 if (count == 0)
@@ -2579,6 +2629,11 @@ again:
2579 goto again; 2629 goto again;
2580 } 2630 }
2581out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2582 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2583 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2584 return 0; 2639 return 0;
@@ -3284,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3284 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3285 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3286 u64 target; 3341 u64 target;
3342 u64 tmp;
3287 3343
3288 /* 3344 /*
3289 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3300,30 +3356,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3300 } 3356 }
3301 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3302 3358
3359 /* First, mask out the RAID levels which aren't possible */
3303 if (num_devices == 1) 3360 if (num_devices == 1)
3304 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3305 if (num_devices < 4) 3365 if (num_devices < 4)
3306 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3307 3367
3308 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3309 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3310 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3311 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3312 }
3313 3372
3314 if ((flags & BTRFS_BLOCK_GROUP_RAID1) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3315 (flags & BTRFS_BLOCK_GROUP_RAID10)) { 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3316 flags &= ~BTRFS_BLOCK_GROUP_RAID1; 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3317 } 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3318 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3319 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3320 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3321 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3322 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3323 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3324 }
3325 3383
3326 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3327} 3385}
3328 3386
3329static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3347,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3347u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3348{ 3406{
3349 u64 flags; 3407 u64 flags;
3408 u64 ret;
3350 3409
3351 if (data) 3410 if (data)
3352 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3355,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3355 else 3414 else
3356 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3357 3416
3358 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3359} 3419}
3360 3420
3361/* 3421/*
@@ -3530,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3530{ 3590{
3531 u64 num_dev; 3591 u64 num_dev;
3532 3592
3533 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3534 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3535 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3536 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3537 num_dev = 2; 3599 num_dev = 2;
@@ -3706,7 +3768,9 @@ static int can_overcommit(struct btrfs_root *root,
3706 3768
3707 /* 3769 /*
3708 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3709 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3710 */ 3774 */
3711 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3712 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
@@ -5539,10 +5603,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5539 return ret; 5603 return ret;
5540} 5604}
5541 5605
5542static u64 stripe_align(struct btrfs_root *root, u64 val) 5606static u64 stripe_align(struct btrfs_root *root,
5607 struct btrfs_block_group_cache *cache,
5608 u64 val, u64 num_bytes)
5543{ 5609{
5544 u64 mask = ((u64)root->stripesize - 1); 5610 u64 mask;
5545 u64 ret = (val + mask) & ~mask; 5611 u64 ret;
5612 mask = ((u64)root->stripesize - 1);
5613 ret = (val + mask) & ~mask;
5546 return ret; 5614 return ret;
5547} 5615}
5548 5616
@@ -5599,8 +5667,12 @@ int __get_raid_index(u64 flags)
5599 return BTRFS_RAID_DUP; 5667 return BTRFS_RAID_DUP;
5600 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5668 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5601 return BTRFS_RAID_RAID0; 5669 return BTRFS_RAID_RAID0;
5602 else 5670 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5603 return BTRFS_RAID_SINGLE; 5671 return BTRFS_RAID_RAID5;
5672 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5673 return BTRFS_RAID_RAID6;
5674
5675 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5604} 5676}
5605 5677
5606static int get_block_group_index(struct btrfs_block_group_cache *cache) 5678static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5743,6 +5815,8 @@ search:
5743 if (!block_group_bits(block_group, data)) { 5815 if (!block_group_bits(block_group, data)) {
5744 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5816 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5745 BTRFS_BLOCK_GROUP_RAID1 | 5817 BTRFS_BLOCK_GROUP_RAID1 |
5818 BTRFS_BLOCK_GROUP_RAID5 |
5819 BTRFS_BLOCK_GROUP_RAID6 |
5746 BTRFS_BLOCK_GROUP_RAID10; 5820 BTRFS_BLOCK_GROUP_RAID10;
5747 5821
5748 /* 5822 /*
@@ -5771,6 +5845,7 @@ have_block_group:
5771 * lets look there 5845 * lets look there
5772 */ 5846 */
5773 if (last_ptr) { 5847 if (last_ptr) {
5848 unsigned long aligned_cluster;
5774 /* 5849 /*
5775 * the refill lock keeps out other 5850 * the refill lock keeps out other
5776 * people trying to start a new cluster 5851 * people trying to start a new cluster
@@ -5837,11 +5912,15 @@ refill_cluster:
5837 goto unclustered_alloc; 5912 goto unclustered_alloc;
5838 } 5913 }
5839 5914
5915 aligned_cluster = max_t(unsigned long,
5916 empty_cluster + empty_size,
5917 block_group->full_stripe_len);
5918
5840 /* allocate a cluster in this block group */ 5919 /* allocate a cluster in this block group */
5841 ret = btrfs_find_space_cluster(trans, root, 5920 ret = btrfs_find_space_cluster(trans, root,
5842 block_group, last_ptr, 5921 block_group, last_ptr,
5843 search_start, num_bytes, 5922 search_start, num_bytes,
5844 empty_cluster + empty_size); 5923 aligned_cluster);
5845 if (ret == 0) { 5924 if (ret == 0) {
5846 /* 5925 /*
5847 * now pull our allocation out of this 5926 * now pull our allocation out of this
@@ -5912,7 +5991,8 @@ unclustered_alloc:
5912 goto loop; 5991 goto loop;
5913 } 5992 }
5914checks: 5993checks:
5915 search_start = stripe_align(root, offset); 5994 search_start = stripe_align(root, used_block_group,
5995 offset, num_bytes);
5916 5996
5917 /* move on to the next group */ 5997 /* move on to the next group */
5918 if (search_start + num_bytes > 5998 if (search_start + num_bytes >
@@ -7284,6 +7364,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7284 root->fs_info->fs_devices->missing_devices; 7364 root->fs_info->fs_devices->missing_devices;
7285 7365
7286 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7366 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7367 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7287 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7368 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7288 7369
7289 if (num_devices == 1) { 7370 if (num_devices == 1) {
@@ -7837,7 +7918,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7837 btrfs_release_path(path); 7918 btrfs_release_path(path);
7838 cache->flags = btrfs_block_group_flags(&cache->item); 7919 cache->flags = btrfs_block_group_flags(&cache->item);
7839 cache->sectorsize = root->sectorsize; 7920 cache->sectorsize = root->sectorsize;
7840 7921 cache->full_stripe_len = btrfs_full_stripe_len(root,
7922 &root->fs_info->mapping_tree,
7923 found_key.objectid);
7841 btrfs_init_free_space_ctl(cache); 7924 btrfs_init_free_space_ctl(cache);
7842 7925
7843 /* 7926 /*
@@ -7891,6 +7974,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7891 if (!(get_alloc_profile(root, space_info->flags) & 7974 if (!(get_alloc_profile(root, space_info->flags) &
7892 (BTRFS_BLOCK_GROUP_RAID10 | 7975 (BTRFS_BLOCK_GROUP_RAID10 |
7893 BTRFS_BLOCK_GROUP_RAID1 | 7976 BTRFS_BLOCK_GROUP_RAID1 |
7977 BTRFS_BLOCK_GROUP_RAID5 |
7978 BTRFS_BLOCK_GROUP_RAID6 |
7894 BTRFS_BLOCK_GROUP_DUP))) 7979 BTRFS_BLOCK_GROUP_DUP)))
7895 continue; 7980 continue;
7896 /* 7981 /*
@@ -7966,6 +8051,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7966 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8051 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7967 cache->sectorsize = root->sectorsize; 8052 cache->sectorsize = root->sectorsize;
7968 cache->fs_info = root->fs_info; 8053 cache->fs_info = root->fs_info;
8054 cache->full_stripe_len = btrfs_full_stripe_len(root,
8055 &root->fs_info->mapping_tree,
8056 chunk_offset);
7969 8057
7970 atomic_set(&cache->count, 1); 8058 atomic_set(&cache->count, 1);
7971 spin_lock_init(&cache->lock); 8059 spin_lock_init(&cache->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5c00d6aeae75..66f999b97cbb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1895 if (ret) 1895 if (ret)
1896 err = ret; 1896 err = ret;
1897 1897
1898 if (did_repair) { 1898 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1899 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, 1899 rec->start + rec->len - 1,
1900 rec->start + rec->len - 1, 1900 EXTENT_DAMAGED, GFP_NOFS);
1901 EXTENT_DAMAGED, GFP_NOFS); 1901 if (ret && !err)
1902 if (ret && !err) 1902 err = ret;
1903 err = ret;
1904 }
1905 1903
1906 kfree(rec); 1904 kfree(rec);
1907 return err; 1905 return err;
@@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1932 u64 map_length = 0; 1930 u64 map_length = 0;
1933 u64 sector; 1931 u64 sector;
1934 struct btrfs_bio *bbio = NULL; 1932 struct btrfs_bio *bbio = NULL;
1933 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1935 int ret; 1934 int ret;
1936 1935
1937 BUG_ON(!mirror_num); 1936 BUG_ON(!mirror_num);
1938 1937
1938 /* we can't repair anything in raid56 yet */
1939 if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1940 return 0;
1941
1939 bio = bio_alloc(GFP_NOFS, 1); 1942 bio = bio_alloc(GFP_NOFS, 1);
1940 if (!bio) 1943 if (!bio)
1941 return -EIO; 1944 return -EIO;
@@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)
2052 failrec->failed_mirror); 2055 failrec->failed_mirror);
2053 did_repair = !ret; 2056 did_repair = !ret;
2054 } 2057 }
2058 ret = 0;
2055 } 2059 }
2056 2060
2057out: 2061out:
@@ -2487,13 +2491,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
2487 return ret; 2491 return ret;
2488} 2492}
2489 2493
2490static int merge_bio(struct extent_io_tree *tree, struct page *page, 2494static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2491 unsigned long offset, size_t size, struct bio *bio, 2495 unsigned long offset, size_t size, struct bio *bio,
2492 unsigned long bio_flags) 2496 unsigned long bio_flags)
2493{ 2497{
2494 int ret = 0; 2498 int ret = 0;
2495 if (tree->ops && tree->ops->merge_bio_hook) 2499 if (tree->ops && tree->ops->merge_bio_hook)
2496 ret = tree->ops->merge_bio_hook(page, offset, size, bio, 2500 ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2497 bio_flags); 2501 bio_flags);
2498 BUG_ON(ret < 0); 2502 BUG_ON(ret < 0);
2499 return ret; 2503 return ret;
@@ -2528,7 +2532,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2528 sector; 2532 sector;
2529 2533
2530 if (prev_bio_flags != bio_flags || !contig || 2534 if (prev_bio_flags != bio_flags || !contig ||
2531 merge_bio(tree, page, offset, page_size, bio, bio_flags) || 2535 merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2532 bio_add_page(bio, page, page_size, offset) < page_size) { 2536 bio_add_page(bio, page, page_size, offset) < page_size) {
2533 ret = submit_one_bio(rw, bio, mirror_num, 2537 ret = submit_one_bio(rw, bio, mirror_num,
2534 prev_bio_flags); 2538 prev_bio_flags);
@@ -4162,6 +4166,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4162 4166
4163static void check_buffer_tree_ref(struct extent_buffer *eb) 4167static void check_buffer_tree_ref(struct extent_buffer *eb)
4164{ 4168{
4169 int refs;
4165 /* the ref bit is tricky. We have to make sure it is set 4170 /* the ref bit is tricky. We have to make sure it is set
4166 * if we have the buffer dirty. Otherwise the 4171 * if we have the buffer dirty. Otherwise the
4167 * code to free a buffer can end up dropping a dirty 4172 * code to free a buffer can end up dropping a dirty
@@ -4182,6 +4187,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
4182 * So bump the ref count first, then set the bit. If someone 4187 * So bump the ref count first, then set the bit. If someone
4183 * beat us to it, drop the ref we added. 4188 * beat us to it, drop the ref we added.
4184 */ 4189 */
4190 refs = atomic_read(&eb->refs);
4191 if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4192 return;
4193
4185 spin_lock(&eb->refs_lock); 4194 spin_lock(&eb->refs_lock);
4186 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) 4195 if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4187 atomic_inc(&eb->refs); 4196 atomic_inc(&eb->refs);
@@ -4383,9 +4392,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4383 4392
4384void free_extent_buffer(struct extent_buffer *eb) 4393void free_extent_buffer(struct extent_buffer *eb)
4385{ 4394{
4395 int refs;
4396 int old;
4386 if (!eb) 4397 if (!eb)
4387 return; 4398 return;
4388 4399
4400 while (1) {
4401 refs = atomic_read(&eb->refs);
4402 if (refs <= 3)
4403 break;
4404 old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4405 if (old == refs)
4406 return;
4407 }
4408
4389 spin_lock(&eb->refs_lock); 4409 spin_lock(&eb->refs_lock);
4390 if (atomic_read(&eb->refs) == 2 && 4410 if (atomic_read(&eb->refs) == 2 &&
4391 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) 4411 test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ff182322d112..dc81868d975a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,7 +72,7 @@ struct extent_io_ops {
72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 72 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end); 73 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
74 extent_submit_bio_hook_t *submit_bio_hook; 74 extent_submit_bio_hook_t *submit_bio_hook;
75 int (*merge_bio_hook)(struct page *page, unsigned long offset, 75 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
76 size_t size, struct bio *bio, 76 size_t size, struct bio *bio,
77 unsigned long bio_flags); 77 unsigned long bio_flags);
78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); 78 int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c8090f18c217..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1465,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1465} 1465}
1466 1466
1467static struct btrfs_free_space * 1467static struct btrfs_free_space *
1468find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) 1468find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1469 unsigned long align)
1469{ 1470{
1470 struct btrfs_free_space *entry; 1471 struct btrfs_free_space *entry;
1471 struct rb_node *node; 1472 struct rb_node *node;
1473 u64 ctl_off;
1474 u64 tmp;
1475 u64 align_off;
1472 int ret; 1476 int ret;
1473 1477
1474 if (!ctl->free_space_offset.rb_node) 1478 if (!ctl->free_space_offset.rb_node)
@@ -1483,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
1483 if (entry->bytes < *bytes) 1487 if (entry->bytes < *bytes)
1484 continue; 1488 continue;
1485 1489
1490 /* make sure the space returned is big enough
1491 * to match our requested alignment
1492 */
1493 if (*bytes >= align) {
1494 ctl_off = entry->offset - ctl->start;
1495 tmp = ctl_off + align - 1;;
1496 do_div(tmp, align);
1497 tmp = tmp * align + ctl->start;
1498 align_off = tmp - entry->offset;
1499 } else {
1500 align_off = 0;
1501 tmp = entry->offset;
1502 }
1503
1504 if (entry->bytes < *bytes + align_off)
1505 continue;
1506
1486 if (entry->bitmap) { 1507 if (entry->bitmap) {
1487 ret = search_bitmap(ctl, entry, offset, bytes); 1508 ret = search_bitmap(ctl, entry, &tmp, bytes);
1488 if (!ret) 1509 if (!ret) {
1510 *offset = tmp;
1489 return entry; 1511 return entry;
1512 }
1490 continue; 1513 continue;
1491 } 1514 }
1492 1515
1493 *offset = entry->offset; 1516 *offset = tmp;
1494 *bytes = entry->bytes; 1517 *bytes = entry->bytes - align_off;
1495 return entry; 1518 return entry;
1496 } 1519 }
1497 1520
@@ -2101,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2101 struct btrfs_free_space *entry = NULL; 2124 struct btrfs_free_space *entry = NULL;
2102 u64 bytes_search = bytes + empty_size; 2125 u64 bytes_search = bytes + empty_size;
2103 u64 ret = 0; 2126 u64 ret = 0;
2127 u64 align_gap = 0;
2128 u64 align_gap_len = 0;
2104 2129
2105 spin_lock(&ctl->tree_lock); 2130 spin_lock(&ctl->tree_lock);
2106 entry = find_free_space(ctl, &offset, &bytes_search); 2131 entry = find_free_space(ctl, &offset, &bytes_search,
2132 block_group->full_stripe_len);
2107 if (!entry) 2133 if (!entry)
2108 goto out; 2134 goto out;
2109 2135
@@ -2113,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2113 if (!entry->bytes) 2139 if (!entry->bytes)
2114 free_bitmap(ctl, entry); 2140 free_bitmap(ctl, entry);
2115 } else { 2141 } else {
2142
2116 unlink_free_space(ctl, entry); 2143 unlink_free_space(ctl, entry);
2117 entry->offset += bytes; 2144 align_gap_len = offset - entry->offset;
2118 entry->bytes -= bytes; 2145 align_gap = entry->offset;
2146
2147 entry->offset = offset + bytes;
2148 WARN_ON(entry->bytes < bytes + align_gap_len);
2149
2150 entry->bytes -= bytes + align_gap_len;
2119 if (!entry->bytes) 2151 if (!entry->bytes)
2120 kmem_cache_free(btrfs_free_space_cachep, entry); 2152 kmem_cache_free(btrfs_free_space_cachep, entry);
2121 else 2153 else
@@ -2125,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2125out: 2157out:
2126 spin_unlock(&ctl->tree_lock); 2158 spin_unlock(&ctl->tree_lock);
2127 2159
2160 if (align_gap_len)
2161 __btrfs_add_free_space(ctl, align_gap, align_gap_len);
2128 return ret; 2162 return ret;
2129} 2163}
2130 2164
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1aa98be54ce0..4e6a11c2cfdd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -40,6 +40,7 @@
40#include <linux/ratelimit.h> 40#include <linux/ratelimit.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/btrfs.h> 42#include <linux/btrfs.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -1605,7 +1606,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1605 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure 1606 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1606 * we don't create bios that span stripes or chunks 1607 * we don't create bios that span stripes or chunks
1607 */ 1608 */
1608int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 1609int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1609 size_t size, struct bio *bio, 1610 size_t size, struct bio *bio,
1610 unsigned long bio_flags) 1611 unsigned long bio_flags)
1611{ 1612{
@@ -1620,7 +1621,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1620 1621
1621 length = bio->bi_size; 1622 length = bio->bi_size;
1622 map_length = length; 1623 map_length = length;
1623 ret = btrfs_map_block(root->fs_info, READ, logical, 1624 ret = btrfs_map_block(root->fs_info, rw, logical,
1624 &map_length, NULL, 0); 1625 &map_length, NULL, 0);
1625 /* Will always return 0 with map_multi == NULL */ 1626 /* Will always return 0 with map_multi == NULL */
1626 BUG_ON(ret < 0); 1627 BUG_ON(ret < 0);
@@ -6464,19 +6465,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6464 int async_submit = 0; 6465 int async_submit = 0;
6465 6466
6466 map_length = orig_bio->bi_size; 6467 map_length = orig_bio->bi_size;
6467 ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, 6468 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6468 &map_length, NULL, 0); 6469 &map_length, NULL, 0);
6469 if (ret) { 6470 if (ret) {
6470 bio_put(orig_bio); 6471 bio_put(orig_bio);
6471 return -EIO; 6472 return -EIO;
6472 } 6473 }
6473
6474 if (map_length >= orig_bio->bi_size) { 6474 if (map_length >= orig_bio->bi_size) {
6475 bio = orig_bio; 6475 bio = orig_bio;
6476 goto submit; 6476 goto submit;
6477 } 6477 }
6478 6478
6479 async_submit = 1; 6479 /* async crcs make it difficult to collect full stripe writes. */
6480 if (btrfs_get_alloc_profile(root, 1) &
6481 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
6482 async_submit = 0;
6483 else
6484 async_submit = 1;
6485
6480 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6486 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6481 if (!bio) 6487 if (!bio)
6482 return -ENOMEM; 6488 return -ENOMEM;
@@ -6518,7 +6524,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6518 bio->bi_end_io = btrfs_end_dio_bio; 6524 bio->bi_end_io = btrfs_end_dio_bio;
6519 6525
6520 map_length = orig_bio->bi_size; 6526 map_length = orig_bio->bi_size;
6521 ret = btrfs_map_block(root->fs_info, READ, 6527 ret = btrfs_map_block(root->fs_info, rw,
6522 start_sector << 9, 6528 start_sector << 9,
6523 &map_length, NULL, 0); 6529 &map_length, NULL, 0);
6524 if (ret) { 6530 if (ret) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
new file mode 100644
index 000000000000..e34e568534d9
--- /dev/null
+++ b/fs/btrfs/raid56.c
@@ -0,0 +1,2080 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19#include <linux/sched.h>
20#include <linux/wait.h>
21#include <linux/bio.h>
22#include <linux/slab.h>
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/random.h>
26#include <linux/iocontext.h>
27#include <linux/capability.h>
28#include <linux/ratelimit.h>
29#include <linux/kthread.h>
30#include <linux/raid/pq.h>
31#include <linux/hash.h>
32#include <linux/list_sort.h>
33#include <linux/raid/xor.h>
34#include <asm/div64.h>
35#include "compat.h"
36#include "ctree.h"
37#include "extent_map.h"
38#include "disk-io.h"
39#include "transaction.h"
40#include "print-tree.h"
41#include "volumes.h"
42#include "raid56.h"
43#include "async-thread.h"
44#include "check-integrity.h"
45#include "rcu-string.h"
46
47/* set when additional merges to this rbio are not allowed */
48#define RBIO_RMW_LOCKED_BIT 1
49
50/*
51 * set when this rbio is sitting in the hash, but it is just a cache
52 * of past RMW
53 */
54#define RBIO_CACHE_BIT 2
55
56/*
57 * set when it is safe to trust the stripe_pages for caching
58 */
59#define RBIO_CACHE_READY_BIT 3
60
61
62#define RBIO_CACHE_SIZE 1024
63
64struct btrfs_raid_bio {
65 struct btrfs_fs_info *fs_info;
66 struct btrfs_bio *bbio;
67
68 /*
69 * logical block numbers for the start of each stripe
70 * The last one or two are p/q. These are sorted,
71 * so raid_map[0] is the start of our full stripe
72 */
73 u64 *raid_map;
74
75 /* while we're doing rmw on a stripe
76 * we put it into a hash table so we can
77 * lock the stripe and merge more rbios
78 * into it.
79 */
80 struct list_head hash_list;
81
82 /*
83 * LRU list for the stripe cache
84 */
85 struct list_head stripe_cache;
86
87 /*
88 * for scheduling work in the helper threads
89 */
90 struct btrfs_work work;
91
92 /*
93 * bio list and bio_list_lock are used
94 * to add more bios into the stripe
95 * in hopes of avoiding the full rmw
96 */
97 struct bio_list bio_list;
98 spinlock_t bio_list_lock;
99
100 /* also protected by the bio_list_lock, the
101 * plug list is used by the plugging code
102 * to collect partial bios while plugged. The
103 * stripe locking code also uses it to hand off
104 * the stripe lock to the next pending IO
105 */
106 struct list_head plug_list;
107
108 /*
109 * flags that tell us if it is safe to
110 * merge with this bio
111 */
112 unsigned long flags;
113
114 /* size of each individual stripe on disk */
115 int stripe_len;
116
117 /* number of data stripes (no p/q) */
118 int nr_data;
119
120 /*
121 * set if we're doing a parity rebuild
122 * for a read from higher up, which is handled
123 * differently from a parity rebuild as part of
124 * rmw
125 */
126 int read_rebuild;
127
128 /* first bad stripe */
129 int faila;
130
131 /* second bad stripe (for raid6 use) */
132 int failb;
133
134 /*
135 * number of pages needed to represent the full
136 * stripe
137 */
138 int nr_pages;
139
140 /*
141 * size of all the bios in the bio_list. This
142 * helps us decide if the rbio maps to a full
143 * stripe or not
144 */
145 int bio_list_bytes;
146
147 atomic_t refs;
148
149 /*
150 * these are two arrays of pointers. We allocate the
151 * rbio big enough to hold them both and setup their
152 * locations when the rbio is allocated
153 */
154
155 /* pointers to pages that we allocated for
156 * reading/writing stripes directly from the disk (including P/Q)
157 */
158 struct page **stripe_pages;
159
160 /*
161 * pointers to the pages in the bio_list. Stored
162 * here for faster lookup
163 */
164 struct page **bio_pages;
165};
166
167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
168static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
169static void rmw_work(struct btrfs_work *work);
170static void read_rebuild_work(struct btrfs_work *work);
171static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
172static void async_read_rebuild(struct btrfs_raid_bio *rbio);
173static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
174static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
175static void __free_raid_bio(struct btrfs_raid_bio *rbio);
176static void index_rbio_pages(struct btrfs_raid_bio *rbio);
177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
178
179/*
180 * the stripe hash table is used for locking, and to collect
181 * bios in hopes of making a full stripe
182 */
183int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
184{
185 struct btrfs_stripe_hash_table *table;
186 struct btrfs_stripe_hash_table *x;
187 struct btrfs_stripe_hash *cur;
188 struct btrfs_stripe_hash *h;
189 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
190 int i;
191
192 if (info->stripe_hash_table)
193 return 0;
194
195 table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
196 if (!table)
197 return -ENOMEM;
198
199 spin_lock_init(&table->cache_lock);
200 INIT_LIST_HEAD(&table->stripe_cache);
201
202 h = table->table;
203
204 for (i = 0; i < num_entries; i++) {
205 cur = h + i;
206 INIT_LIST_HEAD(&cur->hash_list);
207 spin_lock_init(&cur->lock);
208 init_waitqueue_head(&cur->wait);
209 }
210
211 x = cmpxchg(&info->stripe_hash_table, NULL, table);
212 if (x)
213 kfree(x);
214 return 0;
215}
216
217/*
218 * caching an rbio means to copy anything from the
219 * bio_pages array into the stripe_pages array. We
220 * use the page uptodate bit in the stripe cache array
221 * to indicate if it has valid data
222 *
223 * once the caching is done, we set the cache ready
224 * bit.
225 */
226static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
227{
228 int i;
229 char *s;
230 char *d;
231 int ret;
232
233 ret = alloc_rbio_pages(rbio);
234 if (ret)
235 return;
236
237 for (i = 0; i < rbio->nr_pages; i++) {
238 if (!rbio->bio_pages[i])
239 continue;
240
241 s = kmap(rbio->bio_pages[i]);
242 d = kmap(rbio->stripe_pages[i]);
243
244 memcpy(d, s, PAGE_CACHE_SIZE);
245
246 kunmap(rbio->bio_pages[i]);
247 kunmap(rbio->stripe_pages[i]);
248 SetPageUptodate(rbio->stripe_pages[i]);
249 }
250 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
251}
252
253/*
254 * we hash on the first logical address of the stripe
255 */
256static int rbio_bucket(struct btrfs_raid_bio *rbio)
257{
258 u64 num = rbio->raid_map[0];
259
260 /*
261 * we shift down quite a bit. We're using byte
262 * addressing, and most of the lower bits are zeros.
263 * This tends to upset hash_64, and it consistently
264 * returns just one or two different values.
265 *
266 * shifting off the lower bits fixes things.
267 */
268 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
269}
270
271/*
272 * stealing an rbio means taking all the uptodate pages from the stripe
273 * array in the source rbio and putting them into the destination rbio
274 */
275static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
276{
277 int i;
278 struct page *s;
279 struct page *d;
280
281 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
282 return;
283
284 for (i = 0; i < dest->nr_pages; i++) {
285 s = src->stripe_pages[i];
286 if (!s || !PageUptodate(s)) {
287 continue;
288 }
289
290 d = dest->stripe_pages[i];
291 if (d)
292 __free_page(d);
293
294 dest->stripe_pages[i] = s;
295 src->stripe_pages[i] = NULL;
296 }
297}
298
299/*
300 * merging means we take the bio_list from the victim and
301 * splice it into the destination. The victim should
302 * be discarded afterwards.
303 *
304 * must be called with dest->rbio_list_lock held
305 */
306static void merge_rbio(struct btrfs_raid_bio *dest,
307 struct btrfs_raid_bio *victim)
308{
309 bio_list_merge(&dest->bio_list, &victim->bio_list);
310 dest->bio_list_bytes += victim->bio_list_bytes;
311 bio_list_init(&victim->bio_list);
312}
313
314/*
315 * used to prune items that are in the cache. The caller
316 * must hold the hash table lock.
317 */
318static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
319{
320 int bucket = rbio_bucket(rbio);
321 struct btrfs_stripe_hash_table *table;
322 struct btrfs_stripe_hash *h;
323 int freeit = 0;
324
325 /*
326 * check the bit again under the hash table lock.
327 */
328 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
329 return;
330
331 table = rbio->fs_info->stripe_hash_table;
332 h = table->table + bucket;
333
334 /* hold the lock for the bucket because we may be
335 * removing it from the hash table
336 */
337 spin_lock(&h->lock);
338
339 /*
340 * hold the lock for the bio list because we need
341 * to make sure the bio list is empty
342 */
343 spin_lock(&rbio->bio_list_lock);
344
345 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
346 list_del_init(&rbio->stripe_cache);
347 table->cache_size -= 1;
348 freeit = 1;
349
350 /* if the bio list isn't empty, this rbio is
351 * still involved in an IO. We take it out
352 * of the cache list, and drop the ref that
353 * was held for the list.
354 *
355 * If the bio_list was empty, we also remove
356 * the rbio from the hash_table, and drop
357 * the corresponding ref
358 */
359 if (bio_list_empty(&rbio->bio_list)) {
360 if (!list_empty(&rbio->hash_list)) {
361 list_del_init(&rbio->hash_list);
362 atomic_dec(&rbio->refs);
363 BUG_ON(!list_empty(&rbio->plug_list));
364 }
365 }
366 }
367
368 spin_unlock(&rbio->bio_list_lock);
369 spin_unlock(&h->lock);
370
371 if (freeit)
372 __free_raid_bio(rbio);
373}
374
375/*
376 * prune a given rbio from the cache
377 */
378static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
379{
380 struct btrfs_stripe_hash_table *table;
381 unsigned long flags;
382
383 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
384 return;
385
386 table = rbio->fs_info->stripe_hash_table;
387
388 spin_lock_irqsave(&table->cache_lock, flags);
389 __remove_rbio_from_cache(rbio);
390 spin_unlock_irqrestore(&table->cache_lock, flags);
391}
392
393/*
394 * remove everything in the cache
395 */
396void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
397{
398 struct btrfs_stripe_hash_table *table;
399 unsigned long flags;
400 struct btrfs_raid_bio *rbio;
401
402 table = info->stripe_hash_table;
403
404 spin_lock_irqsave(&table->cache_lock, flags);
405 while (!list_empty(&table->stripe_cache)) {
406 rbio = list_entry(table->stripe_cache.next,
407 struct btrfs_raid_bio,
408 stripe_cache);
409 __remove_rbio_from_cache(rbio);
410 }
411 spin_unlock_irqrestore(&table->cache_lock, flags);
412}
413
414/*
415 * remove all cached entries and free the hash table
416 * used by unmount
417 */
418void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
419{
420 if (!info->stripe_hash_table)
421 return;
422 btrfs_clear_rbio_cache(info);
423 kfree(info->stripe_hash_table);
424 info->stripe_hash_table = NULL;
425}
426
427/*
428 * insert an rbio into the stripe cache. It
429 * must have already been prepared by calling
430 * cache_rbio_pages
431 *
432 * If this rbio was already cached, it gets
433 * moved to the front of the lru.
434 *
435 * If the size of the rbio cache is too big, we
436 * prune an item.
437 */
438static void cache_rbio(struct btrfs_raid_bio *rbio)
439{
440 struct btrfs_stripe_hash_table *table;
441 unsigned long flags;
442
443 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
444 return;
445
446 table = rbio->fs_info->stripe_hash_table;
447
448 spin_lock_irqsave(&table->cache_lock, flags);
449 spin_lock(&rbio->bio_list_lock);
450
451 /* bump our ref if we were not in the list before */
452 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
453 atomic_inc(&rbio->refs);
454
455 if (!list_empty(&rbio->stripe_cache)) {
456 list_move(&rbio->stripe_cache, &table->stripe_cache);
457 } else {
458 list_add(&rbio->stripe_cache, &table->stripe_cache);
459 table->cache_size += 1;
460 }
461
462 spin_unlock(&rbio->bio_list_lock);
463
464 if (table->cache_size > RBIO_CACHE_SIZE) {
465 struct btrfs_raid_bio *found;
466
467 found = list_entry(table->stripe_cache.prev,
468 struct btrfs_raid_bio,
469 stripe_cache);
470
471 if (found != rbio)
472 __remove_rbio_from_cache(found);
473 }
474
475 spin_unlock_irqrestore(&table->cache_lock, flags);
476 return;
477}
478
479/*
480 * helper function to run the xor_blocks api. It is only
481 * able to do MAX_XOR_BLOCKS at a time, so we need to
482 * loop through.
483 */
484static void run_xor(void **pages, int src_cnt, ssize_t len)
485{
486 int src_off = 0;
487 int xor_src_cnt = 0;
488 void *dest = pages[src_cnt];
489
490 while (src_cnt > 0) {
491 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
492 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
493
494 src_cnt -= xor_src_cnt;
495 src_off += xor_src_cnt;
496 }
497}
498
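run_xor() follows the xor_blocks() convention: the destination buffer sits at pages[src_cnt], directly after the sources, and each source is xored into it in place (finish_rmw() below seeds that destination with the first data block before calling it). A minimal userspace sketch of that calling convention, with a plain byte loop standing in for xor_blocks() and without the MAX_XOR_BLOCKS chunking; names and sizes are illustrative:

/* sketch: RAID5-style parity using the run_xor() calling convention */
#include <assert.h>
#include <stddef.h>
#include <string.h>

#define BLOCK 4096

/* dest lives at pages[src_cnt]; every source is xored into it in place */
static void run_xor_sketch(void **pages, int src_cnt, size_t len)
{
	unsigned char *dest = pages[src_cnt];
	for (int i = 0; i < src_cnt; i++)
		for (size_t j = 0; j < len; j++)
			dest[j] ^= ((unsigned char *)pages[i])[j];
}

int main(void)
{
	static unsigned char d0[BLOCK], d1[BLOCK], d2[BLOCK], p[BLOCK];
	void *ptrs[3] = { d1, d2, p };	/* two sources, destination last */

	d0[0] = 0xaa; d1[0] = 0x55; d2[0] = 0x0f;

	/* same shape as the raid5 branch of finish_rmw(): copy the first
	 * data block into the parity page, then xor in the rest */
	memcpy(p, d0, BLOCK);
	run_xor_sketch(ptrs, 2, BLOCK);

	assert(p[0] == (0xaa ^ 0x55 ^ 0x0f));
	return 0;
}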
499/*
500 * returns true if the bio list inside this rbio
501 * covers an entire stripe (no rmw required).
502 * Must be called with the bio list lock held, or
503 * at a time when you know it is impossible to add
504 * new bios into the list
505 */
506static int __rbio_is_full(struct btrfs_raid_bio *rbio)
507{
508 unsigned long size = rbio->bio_list_bytes;
509 int ret = 1;
510
511 if (size != rbio->nr_data * rbio->stripe_len)
512 ret = 0;
513
514 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
515 return ret;
516}
517
518static int rbio_is_full(struct btrfs_raid_bio *rbio)
519{
520 unsigned long flags;
521 int ret;
522
523 spin_lock_irqsave(&rbio->bio_list_lock, flags);
524 ret = __rbio_is_full(rbio);
525 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
526 return ret;
527}
528
529/*
530 * returns 1 if it is safe to merge two rbios together.
531 * The merging is safe if the two rbios correspond to
532 * the same stripe and if they are both going in the same
533 * direction (read vs write), and if neither one is
534 * locked for final IO
535 *
536 * The caller is responsible for locking such that
537 * rmw_locked is safe to test
538 */
539static int rbio_can_merge(struct btrfs_raid_bio *last,
540 struct btrfs_raid_bio *cur)
541{
542 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
543 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
544 return 0;
545
546 /*
547 * we can't merge with cached rbios, since the
548 * idea is that when we merge the destination
549 * rbio is going to run our IO for us. We can
550 * steal from cached rbios, though; other functions
551 * handle that.
552 */
553 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
554 test_bit(RBIO_CACHE_BIT, &cur->flags))
555 return 0;
556
557 if (last->raid_map[0] !=
558 cur->raid_map[0])
559 return 0;
560
561 /* reads can't merge with writes */
562 if (last->read_rebuild !=
563 cur->read_rebuild) {
564 return 0;
565 }
566
567 return 1;
568}
569
570/*
571 * helper to index into the pstripe
572 */
573static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
574{
575 index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
576 return rbio->stripe_pages[index];
577}
578
579/*
580 * helper to index into the qstripe, returns null
581 * if there is no qstripe
582 */
583static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
584{
585 if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
586 return NULL;
587
588 index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
589 PAGE_CACHE_SHIFT;
590 return rbio->stripe_pages[index];
591}
592
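For orientation: stripe_pages holds every page of data stripe 0, then data stripe 1, and so on, followed by the P stripe and (raid6 only) the Q stripe, which is what the index arithmetic above encodes. A tiny self-check of that math, assuming the 64K stripe_len and 4K pages used elsewhere in this series:

/* sketch: stripe_pages index math behind rbio_pstripe_page/rbio_qstripe_page */
#include <assert.h>

#define SKETCH_STRIPE_LEN	(64 * 1024)	/* assumed raid56 default */
#define SKETCH_PAGE_SHIFT	12		/* assumed 4K pages */

static int pstripe_index(int nr_data, int pagenr)
{
	return pagenr + ((nr_data * SKETCH_STRIPE_LEN) >> SKETCH_PAGE_SHIFT);
}

static int qstripe_index(int nr_data, int pagenr)
{
	return pagenr + (((nr_data + 1) * SKETCH_STRIPE_LEN) >> SKETCH_PAGE_SHIFT);
}

int main(void)
{
	/* 3 data stripes + P + Q: 16 pages per stripe, 80 pages total */
	assert(pstripe_index(3, 0) == 48);	/* first P page */
	assert(qstripe_index(3, 5) == 69);	/* sixth Q page */
	return 0;
}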
593/*
594 * The first stripe in the table for a logical address
595 * has the lock. rbios are added in one of three ways:
596 *
597 * 1) Nobody has the stripe locked yet. The rbio is given
598 * the lock and 0 is returned. The caller must start the IO
599 * themselves.
600 *
601 * 2) Someone has the stripe locked, but we're able to merge
602 * with the lock owner. The rbio is freed and the IO will
603 * start automatically along with the existing rbio. 1 is returned.
604 *
605 * 3) Someone has the stripe locked, but we're not able to merge.
606 * The rbio is added to the lock owner's plug list, or merged into
607 * an rbio already on the plug list. When the lock owner unlocks,
608 * the next rbio on the list is run and the IO is started automatically.
609 * 1 is returned.
610 *
611 * If we return 0, the caller still owns the rbio and must continue with
612 * IO submission. If we return 1, the caller must assume the rbio has
613 * already been freed.
614 */
615static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
616{
617 int bucket = rbio_bucket(rbio);
618 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
619 struct btrfs_raid_bio *cur;
620 struct btrfs_raid_bio *pending;
621 unsigned long flags;
622 DEFINE_WAIT(wait);
623 struct btrfs_raid_bio *freeit = NULL;
624 struct btrfs_raid_bio *cache_drop = NULL;
625 int ret = 0;
626 int walk = 0;
627
628 spin_lock_irqsave(&h->lock, flags);
629 list_for_each_entry(cur, &h->hash_list, hash_list) {
630 walk++;
631 if (cur->raid_map[0] == rbio->raid_map[0]) {
632 spin_lock(&cur->bio_list_lock);
633
634 /* can we steal this cached rbio's pages? */
635 if (bio_list_empty(&cur->bio_list) &&
636 list_empty(&cur->plug_list) &&
637 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
638 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
639 list_del_init(&cur->hash_list);
640 atomic_dec(&cur->refs);
641
642 steal_rbio(cur, rbio);
643 cache_drop = cur;
644 spin_unlock(&cur->bio_list_lock);
645
646 goto lockit;
647 }
648
649 /* can we merge into the lock owner? */
650 if (rbio_can_merge(cur, rbio)) {
651 merge_rbio(cur, rbio);
652 spin_unlock(&cur->bio_list_lock);
653 freeit = rbio;
654 ret = 1;
655 goto out;
656 }
657
658
659 /*
660 * we couldn't merge with the running
661 * rbio, see if we can merge with the
662 * pending ones. We don't have to
663 * check for rmw_locked because there
664 * is no way they are inside finish_rmw
665 * right now
666 */
667 list_for_each_entry(pending, &cur->plug_list,
668 plug_list) {
669 if (rbio_can_merge(pending, rbio)) {
670 merge_rbio(pending, rbio);
671 spin_unlock(&cur->bio_list_lock);
672 freeit = rbio;
673 ret = 1;
674 goto out;
675 }
676 }
677
678 /* no merging, put us on the tail of the plug list,
679 * our rbio will be started when the currently
680 * running rbio unlocks
681 */
682 list_add_tail(&rbio->plug_list, &cur->plug_list);
683 spin_unlock(&cur->bio_list_lock);
684 ret = 1;
685 goto out;
686 }
687 }
688lockit:
689 atomic_inc(&rbio->refs);
690 list_add(&rbio->hash_list, &h->hash_list);
691out:
692 spin_unlock_irqrestore(&h->lock, flags);
693 if (cache_drop)
694 remove_rbio_from_cache(cache_drop);
695 if (freeit)
696 __free_raid_bio(freeit);
697 return ret;
698}
699
700/*
701 * called as rmw or parity rebuild is completed. If the plug list has more
702 * rbios waiting for this stripe, the next one on the list will be started
703 */
704static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
705{
706 int bucket;
707 struct btrfs_stripe_hash *h;
708 unsigned long flags;
709 int keep_cache = 0;
710
711 bucket = rbio_bucket(rbio);
712 h = rbio->fs_info->stripe_hash_table->table + bucket;
713
714 if (list_empty(&rbio->plug_list))
715 cache_rbio(rbio);
716
717 spin_lock_irqsave(&h->lock, flags);
718 spin_lock(&rbio->bio_list_lock);
719
720 if (!list_empty(&rbio->hash_list)) {
721 /*
722 * if we're still cached and there is no other IO
723 * to perform, just leave this rbio here for others
724 * to steal from later
725 */
726 if (list_empty(&rbio->plug_list) &&
727 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
728 keep_cache = 1;
729 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
730 BUG_ON(!bio_list_empty(&rbio->bio_list));
731 goto done;
732 }
733
734 list_del_init(&rbio->hash_list);
735 atomic_dec(&rbio->refs);
736
737 /*
738 * we use the plug list to hold all the rbios
739 * waiting for the chance to lock this stripe.
740 * hand the lock over to one of them.
741 */
742 if (!list_empty(&rbio->plug_list)) {
743 struct btrfs_raid_bio *next;
744 struct list_head *head = rbio->plug_list.next;
745
746 next = list_entry(head, struct btrfs_raid_bio,
747 plug_list);
748
749 list_del_init(&rbio->plug_list);
750
751 list_add(&next->hash_list, &h->hash_list);
752 atomic_inc(&next->refs);
753 spin_unlock(&rbio->bio_list_lock);
754 spin_unlock_irqrestore(&h->lock, flags);
755
756 if (next->read_rebuild)
757 async_read_rebuild(next);
758 else {
759 steal_rbio(rbio, next);
760 async_rmw_stripe(next);
761 }
762
763 goto done_nolock;
764 } else if (waitqueue_active(&h->wait)) {
765 spin_unlock(&rbio->bio_list_lock);
766 spin_unlock_irqrestore(&h->lock, flags);
767 wake_up(&h->wait);
768 goto done_nolock;
769 }
770 }
771done:
772 spin_unlock(&rbio->bio_list_lock);
773 spin_unlock_irqrestore(&h->lock, flags);
774
775done_nolock:
776 if (!keep_cache)
777 remove_rbio_from_cache(rbio);
778}
779
780static void __free_raid_bio(struct btrfs_raid_bio *rbio)
781{
782 int i;
783
784 WARN_ON(atomic_read(&rbio->refs) < 0);
785 if (!atomic_dec_and_test(&rbio->refs))
786 return;
787
788 WARN_ON(!list_empty(&rbio->stripe_cache));
789 WARN_ON(!list_empty(&rbio->hash_list));
790 WARN_ON(!bio_list_empty(&rbio->bio_list));
791
792 for (i = 0; i < rbio->nr_pages; i++) {
793 if (rbio->stripe_pages[i]) {
794 __free_page(rbio->stripe_pages[i]);
795 rbio->stripe_pages[i] = NULL;
796 }
797 }
798 kfree(rbio->raid_map);
799 kfree(rbio->bbio);
800 kfree(rbio);
801}
802
803static void free_raid_bio(struct btrfs_raid_bio *rbio)
804{
805 unlock_stripe(rbio);
806 __free_raid_bio(rbio);
807}
808
809/*
810 * this frees the rbio and runs through all the bios in the
811 * bio_list and calls end_io on them
812 */
813static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
814{
815 struct bio *cur = bio_list_get(&rbio->bio_list);
816 struct bio *next;
817 free_raid_bio(rbio);
818
819 while (cur) {
820 next = cur->bi_next;
821 cur->bi_next = NULL;
822 if (uptodate)
823 set_bit(BIO_UPTODATE, &cur->bi_flags);
824 bio_endio(cur, err);
825 cur = next;
826 }
827}
828
829/*
830 * end io function used by finish_rmw. When we finally
831 * get here, we've written a full stripe
832 */
833static void raid_write_end_io(struct bio *bio, int err)
834{
835 struct btrfs_raid_bio *rbio = bio->bi_private;
836
837 if (err)
838 fail_bio_stripe(rbio, bio);
839
840 bio_put(bio);
841
842 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
843 return;
844
845 err = 0;
846
847 /* OK, we have written all the stripes we need to. */
848 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
849 err = -EIO;
850
851 rbio_orig_end_io(rbio, err, 0);
852 return;
853}
854
855/*
856 * the read/modify/write code wants to use the original bio for
857 * any pages it included, and then use the rbio for everything
858 * else. This function decides if a given index (stripe number)
859 * and page number in that stripe fall inside the original bio
860 * or the rbio.
861 *
862 * if you set bio_list_only, you'll get a NULL back for any ranges
863 * that are outside the bio_list
864 *
865 * This doesn't take any refs on anything, you get a bare page pointer
866 * and the caller must bump refs as required.
867 *
868 * You must call index_rbio_pages once before you can trust
869 * the answers from this function.
870 */
871static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
872 int index, int pagenr, int bio_list_only)
873{
874 int chunk_page;
875 struct page *p = NULL;
876
877 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
878
879 spin_lock_irq(&rbio->bio_list_lock);
880 p = rbio->bio_pages[chunk_page];
881 spin_unlock_irq(&rbio->bio_list_lock);
882
883 if (p || bio_list_only)
884 return p;
885
886 return rbio->stripe_pages[chunk_page];
887}
888
889/*
890 * number of pages we need for the entire stripe across all the
891 * drives
892 */
893static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
894{
895 unsigned long nr = stripe_len * nr_stripes;
896 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
897}
898
899/*
900 * allocation and initial setup for the btrfs_raid_bio. Note that
901 * this does not allocate any pages for rbio->stripe_pages.
902 */
903static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
904 struct btrfs_bio *bbio, u64 *raid_map,
905 u64 stripe_len)
906{
907 struct btrfs_raid_bio *rbio;
908 int nr_data = 0;
909 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
910 void *p;
911
912 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
913 GFP_NOFS);
914 if (!rbio) {
915 kfree(raid_map);
916 kfree(bbio);
917 return ERR_PTR(-ENOMEM);
918 }
919
920 bio_list_init(&rbio->bio_list);
921 INIT_LIST_HEAD(&rbio->plug_list);
922 spin_lock_init(&rbio->bio_list_lock);
923 INIT_LIST_HEAD(&rbio->stripe_cache);
924 INIT_LIST_HEAD(&rbio->hash_list);
925 rbio->bbio = bbio;
926 rbio->raid_map = raid_map;
927 rbio->fs_info = root->fs_info;
928 rbio->stripe_len = stripe_len;
929 rbio->nr_pages = num_pages;
930 rbio->faila = -1;
931 rbio->failb = -1;
932 atomic_set(&rbio->refs, 1);
933
934 /*
935 * the stripe_pages and bio_pages array point to the extra
936 * memory we allocated past the end of the rbio
937 */
938 p = rbio + 1;
939 rbio->stripe_pages = p;
940 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
941
942 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
943 nr_data = bbio->num_stripes - 2;
944 else
945 nr_data = bbio->num_stripes - 1;
946
947 rbio->nr_data = nr_data;
948 return rbio;
949}
950
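The two pointer arrays come out of the same kzalloc() as the struct itself: stripe_pages begins right past the end of the rbio, bio_pages begins num_pages pointers after that, and a single kfree() of the rbio in __free_raid_bio() releases all of it. A minimal userspace sketch of that trailing-array layout (names are illustrative):

/* sketch: one allocation carrying a struct plus two trailing pointer arrays */
#include <stdlib.h>

struct fake_rbio {
	int nr_pages;
	void **stripe_pages;	/* both arrays live inside the same allocation */
	void **bio_pages;
};

static struct fake_rbio *alloc_fake_rbio(int num_pages)
{
	struct fake_rbio *rbio;
	char *p;

	rbio = calloc(1, sizeof(*rbio) + (size_t)num_pages * sizeof(void *) * 2);
	if (!rbio)
		return NULL;

	rbio->nr_pages = num_pages;
	p = (char *)(rbio + 1);			/* first byte past the struct */
	rbio->stripe_pages = (void **)p;
	rbio->bio_pages = (void **)(p + sizeof(void *) * num_pages);
	return rbio;
}

int main(void)
{
	struct fake_rbio *rbio = alloc_fake_rbio(64);

	if (!rbio)
		return 1;
	free(rbio);	/* one free releases the struct and both arrays */
	return 0;
}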
951/* allocate pages for all the stripes in the bio, including parity */
952static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
953{
954 int i;
955 struct page *page;
956
957 for (i = 0; i < rbio->nr_pages; i++) {
958 if (rbio->stripe_pages[i])
959 continue;
960 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
961 if (!page)
962 return -ENOMEM;
963 rbio->stripe_pages[i] = page;
964 ClearPageUptodate(page);
965 }
966 return 0;
967}
968
969/* allocate pages for just the p/q stripes */
970static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
971{
972 int i;
973 struct page *page;
974
975 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
976
977 for (; i < rbio->nr_pages; i++) {
978 if (rbio->stripe_pages[i])
979 continue;
980 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
981 if (!page)
982 return -ENOMEM;
983 rbio->stripe_pages[i] = page;
984 }
985 return 0;
986}
987
988/*
989 * add a single page from a specific stripe into our list of bios for IO.
990 * This will try to merge into existing bios if possible, and returns
991 * zero if all went well.
992 */
993int rbio_add_io_page(struct btrfs_raid_bio *rbio,
994 struct bio_list *bio_list,
995 struct page *page,
996 int stripe_nr,
997 unsigned long page_index,
998 unsigned long bio_max_len)
999{
1000 struct bio *last = bio_list->tail;
1001 u64 last_end = 0;
1002 int ret;
1003 struct bio *bio;
1004 struct btrfs_bio_stripe *stripe;
1005 u64 disk_start;
1006
1007 stripe = &rbio->bbio->stripes[stripe_nr];
1008 disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1009
1010 /* if the device is missing, just fail this stripe */
1011 if (!stripe->dev->bdev)
1012 return fail_rbio_index(rbio, stripe_nr);
1013
1014 /* see if we can add this page onto our existing bio */
1015 if (last) {
1016 last_end = (u64)last->bi_sector << 9;
1017 last_end += last->bi_size;
1018
1019 /*
1020 * we can't merge these if they are from different
1021 * devices or if they are not contiguous
1022 */
1023 if (last_end == disk_start && stripe->dev->bdev &&
1024 test_bit(BIO_UPTODATE, &last->bi_flags) &&
1025 last->bi_bdev == stripe->dev->bdev) {
1026 ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1027 if (ret == PAGE_CACHE_SIZE)
1028 return 0;
1029 }
1030 }
1031
1032 /* put a new bio on the list */
1033 bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
1034 if (!bio)
1035 return -ENOMEM;
1036
1037 bio->bi_size = 0;
1038 bio->bi_bdev = stripe->dev->bdev;
1039 bio->bi_sector = disk_start >> 9;
1040 set_bit(BIO_UPTODATE, &bio->bi_flags);
1041
1042 bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1043 bio_list_add(bio_list, bio);
1044 return 0;
1045}
1046
1047/*
1048 * while we're doing the read/modify/write cycle, we could
1049 * have errors in reading pages off the disk. This checks
1050 * for errors and if we're not able to read the page it'll
1051 * trigger parity reconstruction. The rmw will be finished
1052 * after we've reconstructed the failed stripes
1053 */
1054static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1055{
1056 if (rbio->faila >= 0 || rbio->failb >= 0) {
1057 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1058 __raid56_parity_recover(rbio);
1059 } else {
1060 finish_rmw(rbio);
1061 }
1062}
1063
1064/*
1065 * these are just the pages from the rbio array, not from anything
1066 * the FS sent down to us
1067 */
1068static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1069{
1070 int index;
1071 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1072 index += page;
1073 return rbio->stripe_pages[index];
1074}
1075
1076/*
1077 * helper function to walk our bio list and populate the bio_pages array with
1078 * the result. This seems expensive, but it is faster than constantly
1079 * searching through the bio list as we set up the IO in finish_rmw or stripe
1080 * reconstruction.
1081 *
1082 * This must be called before you trust the answers from page_in_rbio
1083 */
1084static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1085{
1086 struct bio *bio;
1087 u64 start;
1088 unsigned long stripe_offset;
1089 unsigned long page_index;
1090 struct page *p;
1091 int i;
1092
1093 spin_lock_irq(&rbio->bio_list_lock);
1094 bio_list_for_each(bio, &rbio->bio_list) {
1095 start = (u64)bio->bi_sector << 9;
1096 stripe_offset = start - rbio->raid_map[0];
1097 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1098
1099 for (i = 0; i < bio->bi_vcnt; i++) {
1100 p = bio->bi_io_vec[i].bv_page;
1101 rbio->bio_pages[page_index + i] = p;
1102 }
1103 }
1104 spin_unlock_irq(&rbio->bio_list_lock);
1105}
1106
1107/*
1108 * this is called in one of two situations. We either
1109 * have a full stripe from the higher layers, or we've read all
1110 * the missing bits off disk.
1111 *
1112 * This will calculate the parity and then send down any
1113 * changed blocks.
1114 */
1115static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1116{
1117 struct btrfs_bio *bbio = rbio->bbio;
1118 void *pointers[bbio->num_stripes];
1119 int stripe_len = rbio->stripe_len;
1120 int nr_data = rbio->nr_data;
1121 int stripe;
1122 int pagenr;
1123 int p_stripe = -1;
1124 int q_stripe = -1;
1125 struct bio_list bio_list;
1126 struct bio *bio;
1127 int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1128 int ret;
1129
1130 bio_list_init(&bio_list);
1131
1132 if (bbio->num_stripes - rbio->nr_data == 1) {
1133 p_stripe = bbio->num_stripes - 1;
1134 } else if (bbio->num_stripes - rbio->nr_data == 2) {
1135 p_stripe = bbio->num_stripes - 2;
1136 q_stripe = bbio->num_stripes - 1;
1137 } else {
1138 BUG();
1139 }
1140
1141 /* at this point we either have a full stripe,
1142 * or we've read the full stripe from the drive.
1143 * recalculate the parity and write the new results.
1144 *
1145 * We're not allowed to add any new bios to the
1146 * bio list here; anyone else that wants to
1147 * change this stripe needs to do their own rmw.
1148 */
1149 spin_lock_irq(&rbio->bio_list_lock);
1150 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1151 spin_unlock_irq(&rbio->bio_list_lock);
1152
1153 atomic_set(&rbio->bbio->error, 0);
1154
1155 /*
1156 * now that we've set rmw_locked, run through the
1157 * bio list one last time and map the page pointers
1158 *
1159 * We don't cache full rbios because we're assuming
1160 * the higher layers are unlikely to use this area of
1161 * the disk again soon. If they do use it again,
1162 * hopefully they will send another full bio.
1163 */
1164 index_rbio_pages(rbio);
1165 if (!rbio_is_full(rbio))
1166 cache_rbio_pages(rbio);
1167 else
1168 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1169
1170 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1171 struct page *p;
1172 /* first collect one page from each data stripe */
1173 for (stripe = 0; stripe < nr_data; stripe++) {
1174 p = page_in_rbio(rbio, stripe, pagenr, 0);
1175 pointers[stripe] = kmap(p);
1176 }
1177
1178 /* then add the parity stripe */
1179 p = rbio_pstripe_page(rbio, pagenr);
1180 SetPageUptodate(p);
1181 pointers[stripe++] = kmap(p);
1182
1183 if (q_stripe != -1) {
1184
1185 /*
1186 * raid6, add the qstripe and call the
1187 * library function to fill in our p/q
1188 */
1189 p = rbio_qstripe_page(rbio, pagenr);
1190 SetPageUptodate(p);
1191 pointers[stripe++] = kmap(p);
1192
1193 raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1194 pointers);
1195 } else {
1196 /* raid5 */
1197 memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1198 run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1199 }
1200
1201
1202 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1203 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1204 }
1205
1206 /*
1207 * time to start writing. Make bios for everything from the
1208 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1209 * everything else.
1210 */
1211 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1212 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1213 struct page *page;
1214 if (stripe < rbio->nr_data) {
1215 page = page_in_rbio(rbio, stripe, pagenr, 1);
1216 if (!page)
1217 continue;
1218 } else {
1219 page = rbio_stripe_page(rbio, stripe, pagenr);
1220 }
1221
1222 ret = rbio_add_io_page(rbio, &bio_list,
1223 page, stripe, pagenr, rbio->stripe_len);
1224 if (ret)
1225 goto cleanup;
1226 }
1227 }
1228
1229 atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1230 BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1231
1232 while (1) {
1233 bio = bio_list_pop(&bio_list);
1234 if (!bio)
1235 break;
1236
1237 bio->bi_private = rbio;
1238 bio->bi_end_io = raid_write_end_io;
1239 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1240 submit_bio(WRITE, bio);
1241 }
1242 return;
1243
1244cleanup:
1245 rbio_orig_end_io(rbio, -EIO, 0);
1246}
1247
1248/*
1249 * helper to find the stripe number for a given bio. Used to figure out which
1250 * stripe has failed. This expects the bio to correspond to a physical disk,
1251 * so it looks up based on physical sector numbers.
1252 */
1253static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1254 struct bio *bio)
1255{
1256 u64 physical = bio->bi_sector;
1257 u64 stripe_start;
1258 int i;
1259 struct btrfs_bio_stripe *stripe;
1260
1261 physical <<= 9;
1262
1263 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1264 stripe = &rbio->bbio->stripes[i];
1265 stripe_start = stripe->physical;
1266 if (physical >= stripe_start &&
1267 physical < stripe_start + rbio->stripe_len) {
1268 return i;
1269 }
1270 }
1271 return -1;
1272}
1273
1274/*
1275 * helper to find the stripe number for a given
1276 * bio (before mapping). Used to figure out which stripe has
1277 * failed. This looks up based on logical block numbers.
1278 */
1279static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1280 struct bio *bio)
1281{
1282 u64 logical = bio->bi_sector;
1283 u64 stripe_start;
1284 int i;
1285
1286 logical <<= 9;
1287
1288 for (i = 0; i < rbio->nr_data; i++) {
1289 stripe_start = rbio->raid_map[i];
1290 if (logical >= stripe_start &&
1291 logical < stripe_start + rbio->stripe_len) {
1292 return i;
1293 }
1294 }
1295 return -1;
1296}
1297
1298/*
1299 * returns -EIO if we had too many failures
1300 */
1301static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1302{
1303 unsigned long flags;
1304 int ret = 0;
1305
1306 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1307
1308 /* we already know this stripe is bad, move on */
1309 if (rbio->faila == failed || rbio->failb == failed)
1310 goto out;
1311
1312 if (rbio->faila == -1) {
1313 /* first failure on this rbio */
1314 rbio->faila = failed;
1315 atomic_inc(&rbio->bbio->error);
1316 } else if (rbio->failb == -1) {
1317 /* second failure on this rbio */
1318 rbio->failb = failed;
1319 atomic_inc(&rbio->bbio->error);
1320 } else {
1321 ret = -EIO;
1322 }
1323out:
1324 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1325
1326 return ret;
1327}
1328
1329/*
1330 * helper to fail a stripe based on a physical disk
1331 * bio.
1332 */
1333static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1334 struct bio *bio)
1335{
1336 int failed = find_bio_stripe(rbio, bio);
1337
1338 if (failed < 0)
1339 return -EIO;
1340
1341 return fail_rbio_index(rbio, failed);
1342}
1343
1344/*
1345 * this sets each page in the bio uptodate. It should only be used on private
1346 * rbio pages, nothing that comes in from the higher layers
1347 */
1348static void set_bio_pages_uptodate(struct bio *bio)
1349{
1350 int i;
1351 struct page *p;
1352
1353 for (i = 0; i < bio->bi_vcnt; i++) {
1354 p = bio->bi_io_vec[i].bv_page;
1355 SetPageUptodate(p);
1356 }
1357}
1358
1359/*
1360 * end io for the read phase of the rmw cycle. All the bios here are physical
1361 * stripe bios we've read from the disk so we can recalculate the parity of the
1362 * stripe.
1363 *
1364 * This will usually kick off finish_rmw once all the bios are read in, but it
1365 * may trigger parity reconstruction if we had any errors along the way
1366 */
1367static void raid_rmw_end_io(struct bio *bio, int err)
1368{
1369 struct btrfs_raid_bio *rbio = bio->bi_private;
1370
1371 if (err)
1372 fail_bio_stripe(rbio, bio);
1373 else
1374 set_bio_pages_uptodate(bio);
1375
1376 bio_put(bio);
1377
1378 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1379 return;
1380
1381 err = 0;
1382 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1383 goto cleanup;
1384
1385 /*
1386 * this will normally call finish_rmw to start our write
1387 * but if there are any failed stripes we'll reconstruct
1388 * from parity first
1389 */
1390 validate_rbio_for_rmw(rbio);
1391 return;
1392
1393cleanup:
1394
1395 rbio_orig_end_io(rbio, -EIO, 0);
1396}
1397
1398static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1399{
1400 rbio->work.flags = 0;
1401 rbio->work.func = rmw_work;
1402
1403 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1404 &rbio->work);
1405}
1406
1407static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1408{
1409 rbio->work.flags = 0;
1410 rbio->work.func = read_rebuild_work;
1411
1412 btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1413 &rbio->work);
1414}
1415
1416/*
1417 * the stripe must be locked by the caller. It will
1418 * unlock after all the writes are done
1419 */
1420static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1421{
1422 int bios_to_read = 0;
1423 struct btrfs_bio *bbio = rbio->bbio;
1424 struct bio_list bio_list;
1425 int ret;
1426 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1427 int pagenr;
1428 int stripe;
1429 struct bio *bio;
1430
1431 bio_list_init(&bio_list);
1432
1433 ret = alloc_rbio_pages(rbio);
1434 if (ret)
1435 goto cleanup;
1436
1437 index_rbio_pages(rbio);
1438
1439 atomic_set(&rbio->bbio->error, 0);
1440 /*
1441 * build a list of bios to read all the missing parts of this
1442 * stripe
1443 */
1444 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1445 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1446 struct page *page;
1447 /*
1448 * we want to find all the pages missing from
1449 * the rbio and read them from the disk. If
1450 * page_in_rbio finds a page in the bio list
1451 * we don't need to read it off the stripe.
1452 */
1453 page = page_in_rbio(rbio, stripe, pagenr, 1);
1454 if (page)
1455 continue;
1456
1457 page = rbio_stripe_page(rbio, stripe, pagenr);
1458 /*
1459 * the bio cache may have handed us an uptodate
1460 * page. If so, be happy and use it
1461 */
1462 if (PageUptodate(page))
1463 continue;
1464
1465 ret = rbio_add_io_page(rbio, &bio_list, page,
1466 stripe, pagenr, rbio->stripe_len);
1467 if (ret)
1468 goto cleanup;
1469 }
1470 }
1471
1472 bios_to_read = bio_list_size(&bio_list);
1473 if (!bios_to_read) {
1474 /*
1475 * this can happen if others have merged with
1476 * us; it means there is nothing left to read.
1477 * But if there are missing devices it may not be
1478 * safe to do the full stripe write yet.
1479 */
1480 goto finish;
1481 }
1482
1483 /*
1484 * the bbio may be freed once we submit the last bio. Make sure
1485 * not to touch it after that
1486 */
1487 atomic_set(&bbio->stripes_pending, bios_to_read);
1488 while (1) {
1489 bio = bio_list_pop(&bio_list);
1490 if (!bio)
1491 break;
1492
1493 bio->bi_private = rbio;
1494 bio->bi_end_io = raid_rmw_end_io;
1495
1496 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1497 BTRFS_WQ_ENDIO_RAID56);
1498
1499 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1500 submit_bio(READ, bio);
1501 }
1502 /* the actual write will happen once the reads are done */
1503 return 0;
1504
1505cleanup:
1506 rbio_orig_end_io(rbio, -EIO, 0);
1507 return -EIO;
1508
1509finish:
1510 validate_rbio_for_rmw(rbio);
1511 return 0;
1512}
1513
1514/*
1515 * if the upper layers pass in a full stripe, we thank them by only allocating
1516 * enough pages to hold the parity, and sending it all down quickly.
1517 */
1518static int full_stripe_write(struct btrfs_raid_bio *rbio)
1519{
1520 int ret;
1521
1522 ret = alloc_rbio_parity_pages(rbio);
1523 if (ret)
1524 return ret;
1525
1526 ret = lock_stripe_add(rbio);
1527 if (ret == 0)
1528 finish_rmw(rbio);
1529 return 0;
1530}
1531
1532/*
1533 * partial stripe writes get handed over to async helpers.
1534 * We're really hoping to merge a few more writes into this
1535 * rbio before calculating new parity
1536 */
1537static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1538{
1539 int ret;
1540
1541 ret = lock_stripe_add(rbio);
1542 if (ret == 0)
1543 async_rmw_stripe(rbio);
1544 return 0;
1545}
1546
1547/*
1548 * sometimes while we were reading from the drive to
1549 * recalculate parity, enough new bios come in to create
1550 * a full stripe. So we do a check here to see if we can
1551 * go directly to finish_rmw
1552 */
1553static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1554{
1555 /* head off into rmw land if we don't have a full stripe */
1556 if (!rbio_is_full(rbio))
1557 return partial_stripe_write(rbio);
1558 return full_stripe_write(rbio);
1559}
1560
1561/*
1562 * We use plugging callbacks to collect full stripes.
1563 * Any time we get a partial stripe write while plugged
1564 * we collect it into a list. When the unplug comes down,
1565 * we sort the list by logical block number and merge
1566 * everything we can into the same rbios
1567 */
1568struct btrfs_plug_cb {
1569 struct blk_plug_cb cb;
1570 struct btrfs_fs_info *info;
1571 struct list_head rbio_list;
1572 struct btrfs_work work;
1573};
1574
1575/*
1576 * rbios on the plug list are sorted for easier merging.
1577 */
1578static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1579{
1580 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1581 plug_list);
1582 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1583 plug_list);
1584 u64 a_sector = ra->bio_list.head->bi_sector;
1585 u64 b_sector = rb->bio_list.head->bi_sector;
1586
1587 if (a_sector < b_sector)
1588 return -1;
1589 if (a_sector > b_sector)
1590 return 1;
1591 return 0;
1592}
1593
1594static void run_plug(struct btrfs_plug_cb *plug)
1595{
1596 struct btrfs_raid_bio *cur;
1597 struct btrfs_raid_bio *last = NULL;
1598
1599 /*
1600 * sort our plug list then try to merge
1601 * everything we can in hopes of creating full
1602 * stripes.
1603 */
1604 list_sort(NULL, &plug->rbio_list, plug_cmp);
1605 while (!list_empty(&plug->rbio_list)) {
1606 cur = list_entry(plug->rbio_list.next,
1607 struct btrfs_raid_bio, plug_list);
1608 list_del_init(&cur->plug_list);
1609
1610 if (rbio_is_full(cur)) {
1611 /* we have a full stripe, send it down */
1612 full_stripe_write(cur);
1613 continue;
1614 }
1615 if (last) {
1616 if (rbio_can_merge(last, cur)) {
1617 merge_rbio(last, cur);
1618 __free_raid_bio(cur);
1619 continue;
1620
1621 }
1622 __raid56_parity_write(last);
1623 }
1624 last = cur;
1625 }
1626 if (last) {
1627 __raid56_parity_write(last);
1628 }
1629 kfree(plug);
1630}
1631
1632/*
1633 * if the unplug comes from schedule, we have to push the
1634 * work off to a helper thread
1635 */
1636static void unplug_work(struct btrfs_work *work)
1637{
1638 struct btrfs_plug_cb *plug;
1639 plug = container_of(work, struct btrfs_plug_cb, work);
1640 run_plug(plug);
1641}
1642
1643static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1644{
1645 struct btrfs_plug_cb *plug;
1646 plug = container_of(cb, struct btrfs_plug_cb, cb);
1647
1648 if (from_schedule) {
1649 plug->work.flags = 0;
1650 plug->work.func = unplug_work;
1651 btrfs_queue_worker(&plug->info->rmw_workers,
1652 &plug->work);
1653 return;
1654 }
1655 run_plug(plug);
1656}
1657
1658/*
1659 * our main entry point for writes from the rest of the FS.
1660 */
1661int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1662 struct btrfs_bio *bbio, u64 *raid_map,
1663 u64 stripe_len)
1664{
1665 struct btrfs_raid_bio *rbio;
1666 struct btrfs_plug_cb *plug = NULL;
1667 struct blk_plug_cb *cb;
1668
1669 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1670 if (IS_ERR(rbio)) {
1671 kfree(raid_map);
1672 kfree(bbio);
1673 return PTR_ERR(rbio);
1674 }
1675 bio_list_add(&rbio->bio_list, bio);
1676 rbio->bio_list_bytes = bio->bi_size;
1677
1678 /*
1679 * don't plug on full rbios, just get them out the door
1680 * as quickly as we can
1681 */
1682 if (rbio_is_full(rbio))
1683 return full_stripe_write(rbio);
1684
1685 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1686 sizeof(*plug));
1687 if (cb) {
1688 plug = container_of(cb, struct btrfs_plug_cb, cb);
1689 if (!plug->info) {
1690 plug->info = root->fs_info;
1691 INIT_LIST_HEAD(&plug->rbio_list);
1692 }
1693 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1694 } else {
1695 return __raid56_parity_write(rbio);
1696 }
1697 return 0;
1698}
1699
1700/*
1701 * all parity reconstruction happens here. We've read in everything
1702 * we can find from the drives and this does the heavy lifting of
1703 * sorting the good from the bad.
1704 */
1705static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1706{
1707 int pagenr, stripe;
1708 void **pointers;
1709 int faila = -1, failb = -1;
1710 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1711 struct page *page;
1712 int err;
1713 int i;
1714
1715 pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1716 GFP_NOFS);
1717 if (!pointers) {
1718 err = -ENOMEM;
1719 goto cleanup_io;
1720 }
1721
1722 faila = rbio->faila;
1723 failb = rbio->failb;
1724
1725 if (rbio->read_rebuild) {
1726 spin_lock_irq(&rbio->bio_list_lock);
1727 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1728 spin_unlock_irq(&rbio->bio_list_lock);
1729 }
1730
1731 index_rbio_pages(rbio);
1732
1733 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1734 /* setup our array of pointers with pages
1735 * from each stripe
1736 */
1737 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1738 /*
1739 * if we're rebuilding a read, we have to use
1740 * pages from the bio list
1741 */
1742 if (rbio->read_rebuild &&
1743 (stripe == faila || stripe == failb)) {
1744 page = page_in_rbio(rbio, stripe, pagenr, 0);
1745 } else {
1746 page = rbio_stripe_page(rbio, stripe, pagenr);
1747 }
1748 pointers[stripe] = kmap(page);
1749 }
1750
1751 /* all raid6 handling here */
1752 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1753 RAID6_Q_STRIPE) {
1754
1755 /*
1756 * single failure, rebuild from parity raid5
1757 * style
1758 */
1759 if (failb < 0) {
1760 if (faila == rbio->nr_data) {
1761 /*
1762 * Just the P stripe has failed, without
1763 * a bad data or Q stripe.
1764 * TODO, we should redo the xor here.
1765 */
1766 err = -EIO;
1767 goto cleanup;
1768 }
1769 /*
1770 * a single failure in raid6 is rebuilt
1771 * in the pstripe code below
1772 */
1773 goto pstripe;
1774 }
1775
1776 /* make sure our ps and qs are in order */
1777 if (faila > failb) {
1778 int tmp = failb;
1779 failb = faila;
1780 faila = tmp;
1781 }
1782
1783 /* if the q stripe has failed, do a pstripe reconstruction
1784 * from the xors.
1785 * If both the q stripe and the P stripe have failed, we're
1786 * here due to a crc mismatch and we can't give them the
1787 * data they want
1788 */
1789 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1790 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1791 err = -EIO;
1792 goto cleanup;
1793 }
1794 /*
1795 * otherwise we have one bad data stripe and
1796 * a good P stripe. raid5!
1797 */
1798 goto pstripe;
1799 }
1800
1801 if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1802 raid6_datap_recov(rbio->bbio->num_stripes,
1803 PAGE_SIZE, faila, pointers);
1804 } else {
1805 raid6_2data_recov(rbio->bbio->num_stripes,
1806 PAGE_SIZE, faila, failb,
1807 pointers);
1808 }
1809 } else {
1810 void *p;
1811
1812 /* rebuild from P stripe here (raid5 or raid6) */
1813 BUG_ON(failb != -1);
1814pstripe:
1815 /* Copy parity block into failed block to start with */
1816 memcpy(pointers[faila],
1817 pointers[rbio->nr_data],
1818 PAGE_CACHE_SIZE);
1819
1820 /* rearrange the pointer array */
1821 p = pointers[faila];
1822 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1823 pointers[stripe] = pointers[stripe + 1];
1824 pointers[rbio->nr_data - 1] = p;
1825
1826 /* xor in the rest */
1827 run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1828 }
1829 /* if we're doing this rebuild as part of an rmw, go through
1830 * and set all of our private rbio pages in the
1831 * failed stripes as uptodate. This way finish_rmw will
1832 * know they can be trusted. If this was a read reconstruction,
1833 * other endio functions will fiddle the uptodate bits
1834 */
1835 if (!rbio->read_rebuild) {
1836 for (i = 0; i < nr_pages; i++) {
1837 if (faila != -1) {
1838 page = rbio_stripe_page(rbio, faila, i);
1839 SetPageUptodate(page);
1840 }
1841 if (failb != -1) {
1842 page = rbio_stripe_page(rbio, failb, i);
1843 SetPageUptodate(page);
1844 }
1845 }
1846 }
1847 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1848 /*
1849 * if we're rebuilding a read, we have to use
1850 * pages from the bio list
1851 */
1852 if (rbio->read_rebuild &&
1853 (stripe == faila || stripe == failb)) {
1854 page = page_in_rbio(rbio, stripe, pagenr, 0);
1855 } else {
1856 page = rbio_stripe_page(rbio, stripe, pagenr);
1857 }
1858 kunmap(page);
1859 }
1860 }
1861
1862 err = 0;
1863cleanup:
1864 kfree(pointers);
1865
1866cleanup_io:
1867
1868 if (rbio->read_rebuild) {
1869 if (err == 0)
1870 cache_rbio_pages(rbio);
1871 else
1872 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1873
1874 rbio_orig_end_io(rbio, err, err == 0);
1875 } else if (err == 0) {
1876 rbio->faila = -1;
1877 rbio->failb = -1;
1878 finish_rmw(rbio);
1879 } else {
1880 rbio_orig_end_io(rbio, err, 0);
1881 }
1882}
1883
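The pstripe path above is plain RAID5 algebra: a single lost data block equals the parity xored with every surviving data block, which is why the code copies the parity page over the failed slot and then xors the remaining data pages into it. A small self-contained check of that identity (userspace sketch, independent of the rbio machinery):

/* sketch: RAID5 single-failure reconstruction, D_lost = P ^ all other data */
#include <assert.h>
#include <string.h>

#define BLOCK 4096

static void xor_buf(unsigned char *dst, const unsigned char *src, size_t len)
{
	for (size_t i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	static unsigned char d[3][BLOCK], p[BLOCK], rebuilt[BLOCK];

	d[0][7] = 0x11; d[1][7] = 0x22; d[2][7] = 0x33;

	/* parity over all data blocks */
	for (int i = 0; i < 3; i++)
		xor_buf(p, d[i], BLOCK);

	/* pretend d[1] was lost: rebuild it from parity and the survivors */
	memcpy(rebuilt, p, BLOCK);
	xor_buf(rebuilt, d[0], BLOCK);
	xor_buf(rebuilt, d[2], BLOCK);

	assert(memcmp(rebuilt, d[1], BLOCK) == 0);
	return 0;
}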
1884/*
1885 * This is called only for stripes we've read from disk to
1886 * reconstruct the parity.
1887 */
1888static void raid_recover_end_io(struct bio *bio, int err)
1889{
1890 struct btrfs_raid_bio *rbio = bio->bi_private;
1891
1892 /*
1893 * we only read stripe pages off the disk; set them
1894 * up to date if there were no errors
1895 */
1896 if (err)
1897 fail_bio_stripe(rbio, bio);
1898 else
1899 set_bio_pages_uptodate(bio);
1900 bio_put(bio);
1901
1902 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1903 return;
1904
1905 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1906 rbio_orig_end_io(rbio, -EIO, 0);
1907 else
1908 __raid_recover_end_io(rbio);
1909}
1910
1911/*
1912 * reads everything we need off the disk to reconstruct
1913 * the parity. endio handlers trigger final reconstruction
1914 * when the IO is done.
1915 *
1916 * This is used both for reads from the higher layers and for
1917 * parity construction required to finish an rmw cycle.
1918 */
1919static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1920{
1921 int bios_to_read = 0;
1922 struct btrfs_bio *bbio = rbio->bbio;
1923 struct bio_list bio_list;
1924 int ret;
1925 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1926 int pagenr;
1927 int stripe;
1928 struct bio *bio;
1929
1930 bio_list_init(&bio_list);
1931
1932 ret = alloc_rbio_pages(rbio);
1933 if (ret)
1934 goto cleanup;
1935
1936 atomic_set(&rbio->bbio->error, 0);
1937
1938 /*
1939 * read everything that hasn't failed. Thanks to the
1940 * stripe cache, it is possible that some or all of these
1941 * pages are going to be uptodate.
1942 */
1943 for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1944 if (rbio->faila == stripe ||
1945 rbio->failb == stripe)
1946 continue;
1947
1948 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1949 struct page *p;
1950
1951 /*
1952 * the rmw code may have already read this
1953 * page in
1954 */
1955 p = rbio_stripe_page(rbio, stripe, pagenr);
1956 if (PageUptodate(p))
1957 continue;
1958
1959 ret = rbio_add_io_page(rbio, &bio_list,
1960 rbio_stripe_page(rbio, stripe, pagenr),
1961 stripe, pagenr, rbio->stripe_len);
1962 if (ret < 0)
1963 goto cleanup;
1964 }
1965 }
1966
1967 bios_to_read = bio_list_size(&bio_list);
1968 if (!bios_to_read) {
1969 /*
1970 * we might have no bios to read just because the pages
1971 * were up to date, or we might have no bios to read because
1972 * the devices were gone.
1973 */
1974 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1975 __raid_recover_end_io(rbio);
1976 goto out;
1977 } else {
1978 goto cleanup;
1979 }
1980 }
1981
1982 /*
1983 * the bbio may be freed once we submit the last bio. Make sure
1984 * not to touch it after that
1985 */
1986 atomic_set(&bbio->stripes_pending, bios_to_read);
1987 while (1) {
1988 bio = bio_list_pop(&bio_list);
1989 if (!bio)
1990 break;
1991
1992 bio->bi_private = rbio;
1993 bio->bi_end_io = raid_recover_end_io;
1994
1995 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1996 BTRFS_WQ_ENDIO_RAID56);
1997
1998 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1999 submit_bio(READ, bio);
2000 }
2001out:
2002 return 0;
2003
2004cleanup:
2005 if (rbio->read_rebuild)
2006 rbio_orig_end_io(rbio, -EIO, 0);
2007 return -EIO;
2008}
2009
2010/*
2011 * the main entry point for reads from the higher layers. This
2012 * is really only called when the normal read path had a failure,
2013 * so we assume the bio they send down corresponds to a failed part
2014 * of the drive.
2015 */
2016int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2017 struct btrfs_bio *bbio, u64 *raid_map,
2018 u64 stripe_len, int mirror_num)
2019{
2020 struct btrfs_raid_bio *rbio;
2021 int ret;
2022
2023 rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2024 if (IS_ERR(rbio)) {
2025 return PTR_ERR(rbio);
2026 }
2027
2028 rbio->read_rebuild = 1;
2029 bio_list_add(&rbio->bio_list, bio);
2030 rbio->bio_list_bytes = bio->bi_size;
2031
2032 rbio->faila = find_logical_bio_stripe(rbio, bio);
2033 if (rbio->faila == -1) {
2034 BUG();
2035 kfree(rbio);
2036 return -EIO;
2037 }
2038
2039 /*
2040 * reconstruct from the q stripe if they are
2041 * asking for mirror 3
2042 */
2043 if (mirror_num == 3)
2044 rbio->failb = bbio->num_stripes - 2;
2045
2046 ret = lock_stripe_add(rbio);
2047
2048 /*
2049 * __raid56_parity_recover will end the bio with
2050 * any errors it hits. We don't want to return
2051 * its error value up the stack because our caller
2052 * will end up calling bio_endio with any nonzero
2053 * return
2054 */
2055 if (ret == 0)
2056 __raid56_parity_recover(rbio);
2057 /*
2058 * our rbio has been added to the list of
2059 * rbios that will be handled after the
2060 * current lock owner is done
2061 */
2062 return 0;
2063
2064}
2065
2066static void rmw_work(struct btrfs_work *work)
2067{
2068 struct btrfs_raid_bio *rbio;
2069
2070 rbio = container_of(work, struct btrfs_raid_bio, work);
2071 raid56_rmw_stripe(rbio);
2072}
2073
2074static void read_rebuild_work(struct btrfs_work *work)
2075{
2076 struct btrfs_raid_bio *rbio;
2077
2078 rbio = container_of(work, struct btrfs_raid_bio, work);
2079 __raid56_parity_recover(rbio);
2080}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
new file mode 100644
index 000000000000..ea5d73bfdfbe
--- /dev/null
+++ b/fs/btrfs/raid56.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2012 Fusion-io All rights reserved.
3 * Copyright (C) 2012 Intel Corp. All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#ifndef __BTRFS_RAID56__
21#define __BTRFS_RAID56__
22static inline int nr_parity_stripes(struct map_lookup *map)
23{
24 if (map->type & BTRFS_BLOCK_GROUP_RAID5)
25 return 1;
26 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
27 return 2;
28 else
29 return 0;
30}
31
32static inline int nr_data_stripes(struct map_lookup *map)
33{
34 return map->num_stripes - nr_parity_stripes(map);
35}
36#define RAID5_P_STRIPE ((u64)-2)
37#define RAID6_Q_STRIPE ((u64)-1)
38
39#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
40 ((x) == RAID6_Q_STRIPE))
41
42int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
43 struct btrfs_bio *bbio, u64 *raid_map,
44 u64 stripe_len, int mirror_num);
45int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map,
47 u64 stripe_len);
48
49int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
50void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
51#endif
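The P/Q sentinels above are what raid56.c keys off when it interprets raid_map: entries for data stripes carry the logical start of each stripe, and the final one or two entries are the parity markers, which is how alloc_rbio() derives nr_data and how find_logical_bio_stripe() maps a logical address to a failed stripe. A small illustration of that layout for a hypothetical four-device raid6 full stripe (addresses and stripe size invented, layout inferred from the raid56.c code above):

/* sketch: the raid_map layout the raid56 code expects for one full stripe */
#include <inttypes.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* mirrors the header above */
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t stripe_len = 64 * 1024;	/* assumed raid56 default */
	uint64_t logical = 1 << 30;		/* invented full-stripe start */

	/* two data stripes, then P, then Q (raid6 on four devices) */
	uint64_t raid_map[4] = {
		logical,
		logical + stripe_len,
		RAID5_P_STRIPE,
		RAID6_Q_STRIPE,
	};

	/* alloc_rbio() infers nr_data from the trailing Q marker */
	int num_stripes = 4;
	int nr_data = (raid_map[num_stripes - 1] == RAID6_Q_STRIPE) ?
		      num_stripes - 2 : num_stripes - 1;
	printf("nr_data = %d\n", nr_data);
	return 0;
}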
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c78b2a3fc335..53c3501fa4ca 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -28,6 +28,7 @@
28#include "dev-replace.h" 28#include "dev-replace.h"
29#include "check-integrity.h" 29#include "check-integrity.h"
30#include "rcu-string.h" 30#include "rcu-string.h"
31#include "raid56.h"
31 32
32/* 33/*
33 * This is only the first step towards a full-features scrub. It reads all 34 * This is only the first step towards a full-features scrub. It reads all
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2254 struct btrfs_device *extent_dev; 2255 struct btrfs_device *extent_dev;
2255 int extent_mirror_num; 2256 int extent_mirror_num;
2256 2257
2258 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
2259 BTRFS_BLOCK_GROUP_RAID6)) {
2260 if (num >= nr_data_stripes(map)) {
2261 return 0;
2262 }
2263 }
2264
2257 nstripes = length; 2265 nstripes = length;
2258 offset = 0; 2266 offset = 0;
2259 do_div(nstripes, map->stripe_len); 2267 do_div(nstripes, map->stripe_len);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 955204ca0447..a83d486cc70c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -167,6 +167,9 @@ loop:
167 167
168 spin_lock_init(&cur_trans->commit_lock); 168 spin_lock_init(&cur_trans->commit_lock);
169 spin_lock_init(&cur_trans->delayed_refs.lock); 169 spin_lock_init(&cur_trans->delayed_refs.lock);
170 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
171 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
172 init_waitqueue_head(&cur_trans->delayed_refs.wait);
170 173
171 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 174 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
172 INIT_LIST_HEAD(&cur_trans->ordered_operations); 175 INIT_LIST_HEAD(&cur_trans->ordered_operations);
@@ -637,7 +640,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
637 if (!list_empty(&trans->new_bgs)) 640 if (!list_empty(&trans->new_bgs))
638 btrfs_create_pending_block_groups(trans, root); 641 btrfs_create_pending_block_groups(trans, root);
639 642
640 while (count < 2) { 643 while (count < 1) {
641 unsigned long cur = trans->delayed_ref_updates; 644 unsigned long cur = trans->delayed_ref_updates;
642 trans->delayed_ref_updates = 0; 645 trans->delayed_ref_updates = 0;
643 if (cur && 646 if (cur &&
@@ -649,6 +652,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
649 } 652 }
650 count++; 653 count++;
651 } 654 }
655
652 btrfs_trans_release_metadata(trans, root); 656 btrfs_trans_release_metadata(trans, root);
653 trans->block_rsv = NULL; 657 trans->block_rsv = NULL;
654 658
@@ -744,7 +748,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
744 struct extent_state *cached_state = NULL; 748 struct extent_state *cached_state = NULL;
745 u64 start = 0; 749 u64 start = 0;
746 u64 end; 750 u64 end;
751 struct blk_plug plug;
747 752
753 blk_start_plug(&plug);
748 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 754 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
749 mark, &cached_state)) { 755 mark, &cached_state)) {
750 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 756 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
@@ -758,6 +764,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
758 } 764 }
759 if (err) 765 if (err)
760 werr = err; 766 werr = err;
767 blk_finish_plug(&plug);
761 return werr; 768 return werr;
762} 769}
763 770
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 72b1cf1b2b5e..7992dc4ea4cc 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -25,6 +25,8 @@
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/ratelimit.h> 26#include <linux/ratelimit.h>
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/raid/pq.h>
29#include <asm/div64.h>
28#include "compat.h" 30#include "compat.h"
29#include "ctree.h" 31#include "ctree.h"
30#include "extent_map.h" 32#include "extent_map.h"
@@ -32,6 +34,7 @@
32#include "transaction.h" 34#include "transaction.h"
33#include "print-tree.h" 35#include "print-tree.h"
34#include "volumes.h" 36#include "volumes.h"
37#include "raid56.h"
35#include "async-thread.h" 38#include "async-thread.h"
36#include "check-integrity.h" 39#include "check-integrity.h"
37#include "rcu-string.h" 40#include "rcu-string.h"
@@ -1465,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1465 goto out; 1468 goto out;
1466 } 1469 }
1467 1470
1471 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1472 root->fs_info->fs_devices->rw_devices <= 2) {
1473 printk(KERN_ERR "btrfs: unable to go below two "
1474 "devices on raid5\n");
1475 ret = -EINVAL;
1476 goto out;
1477 }
1478 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1479 root->fs_info->fs_devices->rw_devices <= 3) {
1480 printk(KERN_ERR "btrfs: unable to go below three "
1481 "devices on raid6\n");
1482 ret = -EINVAL;
1483 goto out;
1484 }
1485
1468 if (strcmp(device_path, "missing") == 0) { 1486 if (strcmp(device_path, "missing") == 0) {
1469 struct list_head *devices; 1487 struct list_head *devices;
1470 struct btrfs_device *tmp; 1488 struct btrfs_device *tmp;
@@ -2726,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
2726 return 0; 2744 return 0;
2727 2745
2728 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | 2746 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2729 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) 2747 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
2730 factor = 2; 2748 factor = num_stripes / 2;
2731 else 2749 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
2732 factor = 1; 2750 factor = num_stripes - 1;
2733 factor = num_stripes / factor; 2751 } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
2752 factor = num_stripes - 2;
2753 } else {
2754 factor = num_stripes;
2755 }
2734 2756
2735 for (i = 0; i < num_stripes; i++) { 2757 for (i = 0; i < num_stripes; i++) {
2736 stripe = btrfs_stripe_nr(chunk, i); 2758 stripe = btrfs_stripe_nr(chunk, i);
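chunk_drange_filter() divides the chunk length by factor to get the byte extent each stripe covers on its device, so factor has to be the number of data stripes: half of num_stripes for the mirrored profiles, num_stripes minus one or two for RAID5/RAID6, and num_stripes otherwise. The same computation written out as a helper, for illustration only:

#include <linux/math64.h>

/* Byte extent covered by each stripe of the chunk on its device. */
static u64 stripe_length_on_device(u64 chunk_len, u64 type, int num_stripes)
{
	int data_stripes;

	if (type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
		    BTRFS_BLOCK_GROUP_RAID10))
		data_stripes = num_stripes / 2;
	else if (type & BTRFS_BLOCK_GROUP_RAID5)
		data_stripes = num_stripes - 1;
	else if (type & BTRFS_BLOCK_GROUP_RAID6)
		data_stripes = num_stripes - 2;
	else
		data_stripes = num_stripes;

	return div_u64(chunk_len, data_stripes);
}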
@@ -3090,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3090 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3112 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3091 else 3113 else
3092 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 3114 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
3093 BTRFS_BLOCK_GROUP_RAID10); 3115 BTRFS_BLOCK_GROUP_RAID10 |
3116 BTRFS_BLOCK_GROUP_RAID5 |
3117 BTRFS_BLOCK_GROUP_RAID6);
3094 3118
3095 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3119 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3096 (!alloc_profile_is_valid(bctl->data.target, 1) || 3120 (!alloc_profile_is_valid(bctl->data.target, 1) ||
@@ -3130,7 +3154,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3130 3154
3131 /* allow to reduce meta or sys integrity only if force set */ 3155 /* allow to reduce meta or sys integrity only if force set */
3132 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | 3156 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3133 BTRFS_BLOCK_GROUP_RAID10; 3157 BTRFS_BLOCK_GROUP_RAID10 |
3158 BTRFS_BLOCK_GROUP_RAID5 |
3159 BTRFS_BLOCK_GROUP_RAID6;
3134 do { 3160 do {
3135 seq = read_seqbegin(&fs_info->profiles_lock); 3161 seq = read_seqbegin(&fs_info->profiles_lock);
3136 3162
@@ -3204,11 +3230,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3204 update_ioctl_balance_args(fs_info, 0, bargs); 3230 update_ioctl_balance_args(fs_info, 0, bargs);
3205 } 3231 }
3206 3232
3207 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3208 balance_need_close(fs_info)) {
3209 __cancel_balance(fs_info);
3210 }
3211
3212 wake_up(&fs_info->balance_wait_q); 3233 wake_up(&fs_info->balance_wait_q);
3213 3234
3214 return ret; 3235 return ret;
@@ -3611,8 +3632,46 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
3611 .devs_increment = 1, 3632 .devs_increment = 1,
3612 .ncopies = 1, 3633 .ncopies = 1,
3613 }, 3634 },
3635 [BTRFS_RAID_RAID5] = {
3636 .sub_stripes = 1,
3637 .dev_stripes = 1,
3638 .devs_max = 0,
3639 .devs_min = 2,
3640 .devs_increment = 1,
3641 .ncopies = 2,
3642 },
3643 [BTRFS_RAID_RAID6] = {
3644 .sub_stripes = 1,
3645 .dev_stripes = 1,
3646 .devs_max = 0,
3647 .devs_min = 3,
3648 .devs_increment = 1,
3649 .ncopies = 3,
3650 },
3614}; 3651};
3615 3652
3653static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
3654{
3655 /* TODO allow them to set a preferred stripe size */
3656 return 64 * 1024;
3657}
3658
3659static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
3660{
3661 u64 features;
3662
3663 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
3664 return;
3665
3666 features = btrfs_super_incompat_flags(info->super_copy);
3667 if (features & BTRFS_FEATURE_INCOMPAT_RAID56)
3668 return;
3669
3670 features |= BTRFS_FEATURE_INCOMPAT_RAID56;
3671 btrfs_set_super_incompat_flags(info->super_copy, features);
3672 printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n");
3673}
3674
3616static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 3675static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3617 struct btrfs_root *extent_root, 3676 struct btrfs_root *extent_root,
3618 struct map_lookup **map_ret, 3677 struct map_lookup **map_ret,
@@ -3628,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3628 struct btrfs_device_info *devices_info = NULL; 3687 struct btrfs_device_info *devices_info = NULL;
3629 u64 total_avail; 3688 u64 total_avail;
3630 int num_stripes; /* total number of stripes to allocate */ 3689 int num_stripes; /* total number of stripes to allocate */
3690 int data_stripes; /* number of stripes that count for
3691 block group size */
3631 int sub_stripes; /* sub_stripes info for map */ 3692 int sub_stripes; /* sub_stripes info for map */
3632 int dev_stripes; /* stripes per dev */ 3693 int dev_stripes; /* stripes per dev */
3633 int devs_max; /* max devs to use */ 3694 int devs_max; /* max devs to use */
@@ -3639,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3639 u64 max_chunk_size; 3700 u64 max_chunk_size;
3640 u64 stripe_size; 3701 u64 stripe_size;
3641 u64 num_bytes; 3702 u64 num_bytes;
3703 u64 raid_stripe_len = BTRFS_STRIPE_LEN;
3642 int ndevs; 3704 int ndevs;
3643 int i; 3705 int i;
3644 int j; 3706 int j;
@@ -3768,16 +3830,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3768 stripe_size = devices_info[ndevs-1].max_avail; 3830 stripe_size = devices_info[ndevs-1].max_avail;
3769 num_stripes = ndevs * dev_stripes; 3831 num_stripes = ndevs * dev_stripes;
3770 3832
3833 /*
3834 * this will have to be fixed for RAID1 and RAID10 over
3835 * more drives
3836 */
3837 data_stripes = num_stripes / ncopies;
3838
3771 if (stripe_size * ndevs > max_chunk_size * ncopies) { 3839 if (stripe_size * ndevs > max_chunk_size * ncopies) {
3772 stripe_size = max_chunk_size * ncopies; 3840 stripe_size = max_chunk_size * ncopies;
3773 do_div(stripe_size, ndevs); 3841 do_div(stripe_size, ndevs);
3774 } 3842 }
3775 3843 if (type & BTRFS_BLOCK_GROUP_RAID5) {
3844 raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
3845 btrfs_super_stripesize(info->super_copy));
3846 data_stripes = num_stripes - 1;
3847 }
3848 if (type & BTRFS_BLOCK_GROUP_RAID6) {
3849 raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
3850 btrfs_super_stripesize(info->super_copy));
3851 data_stripes = num_stripes - 2;
3852 }
3776 do_div(stripe_size, dev_stripes); 3853 do_div(stripe_size, dev_stripes);
3777 3854
3778 /* align to BTRFS_STRIPE_LEN */ 3855 /* align to BTRFS_STRIPE_LEN */
3779 do_div(stripe_size, BTRFS_STRIPE_LEN); 3856 do_div(stripe_size, raid_stripe_len);
3780 stripe_size *= BTRFS_STRIPE_LEN; 3857 stripe_size *= raid_stripe_len;
3781 3858
3782 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 3859 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3783 if (!map) { 3860 if (!map) {
@@ -3795,14 +3872,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3795 } 3872 }
3796 } 3873 }
3797 map->sector_size = extent_root->sectorsize; 3874 map->sector_size = extent_root->sectorsize;
3798 map->stripe_len = BTRFS_STRIPE_LEN; 3875 map->stripe_len = raid_stripe_len;
3799 map->io_align = BTRFS_STRIPE_LEN; 3876 map->io_align = raid_stripe_len;
3800 map->io_width = BTRFS_STRIPE_LEN; 3877 map->io_width = raid_stripe_len;
3801 map->type = type; 3878 map->type = type;
3802 map->sub_stripes = sub_stripes; 3879 map->sub_stripes = sub_stripes;
3803 3880
3804 *map_ret = map; 3881 *map_ret = map;
3805 num_bytes = stripe_size * (num_stripes / ncopies); 3882 num_bytes = stripe_size * data_stripes;
3806 3883
3807 *stripe_size_out = stripe_size; 3884 *stripe_size_out = stripe_size;
3808 *num_bytes_out = num_bytes; 3885 *num_bytes_out = num_bytes;
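With find_raid56_stripe_len() currently pinning raid_stripe_len at 64KiB, the allocator rounds stripe_size down to a 64KiB multiple and sizes the block group from the data stripes only, rather than the old num_stripes / ncopies calculation. A worked example under those assumptions, for six devices in RAID6 with 1GiB available on each:

/* Illustrative arithmetic only; mirrors the do_div rounding above. */
static u64 raid6_chunk_bytes_example(void)
{
	u64 stripe_size = 1024ULL * 1024 * 1024;	/* per-device allocation */
	u64 raid_stripe_len = 64 * 1024;		/* fixed 64KiB element */
	int data_stripes = 6 - 2;			/* six devices, two parity */

	stripe_size -= stripe_size % raid_stripe_len;	/* already aligned here */
	return stripe_size * data_stripes;		/* 4GiB usable in the chunk */
}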
@@ -3853,6 +3930,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3853 } 3930 }
3854 3931
3855 free_extent_map(em); 3932 free_extent_map(em);
3933 check_raid56_incompat_flag(extent_root->fs_info, type);
3934
3856 kfree(devices_info); 3935 kfree(devices_info);
3857 return 0; 3936 return 0;
3858 3937
@@ -4136,6 +4215,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4136 ret = map->num_stripes; 4215 ret = map->num_stripes;
4137 else if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4216 else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4138 ret = map->sub_stripes; 4217 ret = map->sub_stripes;
4218 else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4219 ret = 2;
4220 else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4221 ret = 3;
4139 else 4222 else
4140 ret = 1; 4223 ret = 1;
4141 free_extent_map(em); 4224 free_extent_map(em);
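btrfs_num_copies() now reports two "copies" for RAID5 and three for RAID6: the data itself plus one reconstruction per parity stripe. Read-retry callers can keep walking mirror numbers exactly as they do for RAID1/RAID10; mirrors 2 and 3 are translated into parity rebuilds by the mapping code further down. A hedged sketch of that retry shape (submit_and_wait is a made-up helper, not a btrfs function):

int submit_and_wait(struct btrfs_fs_info *fs_info, u64 logical, u64 len,
		    int mirror);	/* hypothetical */

static int read_with_retries(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	int copies = btrfs_num_copies(fs_info, logical, len);
	int mirror, ret = -EIO;

	for (mirror = 1; mirror <= copies && ret; mirror++)
		ret = submit_and_wait(fs_info, logical, len, mirror);
	return ret;
}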
@@ -4148,6 +4231,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4148 return ret; 4231 return ret;
4149} 4232}
4150 4233
4234unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4235 struct btrfs_mapping_tree *map_tree,
4236 u64 logical)
4237{
4238 struct extent_map *em;
4239 struct map_lookup *map;
4240 struct extent_map_tree *em_tree = &map_tree->map_tree;
4241 unsigned long len = root->sectorsize;
4242
4243 read_lock(&em_tree->lock);
4244 em = lookup_extent_mapping(em_tree, logical, len);
4245 read_unlock(&em_tree->lock);
4246 BUG_ON(!em);
4247
4248 BUG_ON(em->start > logical || em->start + em->len < logical);
4249 map = (struct map_lookup *)em->bdev;
4250 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4251 BTRFS_BLOCK_GROUP_RAID6)) {
4252 len = map->stripe_len * nr_data_stripes(map);
4253 }
4254 free_extent_map(em);
4255 return len;
4256}
4257
4258int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4259 u64 logical, u64 len, int mirror_num)
4260{
4261 struct extent_map *em;
4262 struct map_lookup *map;
4263 struct extent_map_tree *em_tree = &map_tree->map_tree;
4264 int ret = 0;
4265
4266 read_lock(&em_tree->lock);
4267 em = lookup_extent_mapping(em_tree, logical, len);
4268 read_unlock(&em_tree->lock);
4269 BUG_ON(!em);
4270
4271 BUG_ON(em->start > logical || em->start + em->len < logical);
4272 map = (struct map_lookup *)em->bdev;
4273 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4274 BTRFS_BLOCK_GROUP_RAID6))
4275 ret = 1;
4276 free_extent_map(em);
4277 return ret;
4278}
4279
4151static int find_live_mirror(struct btrfs_fs_info *fs_info, 4280static int find_live_mirror(struct btrfs_fs_info *fs_info,
4152 struct map_lookup *map, int first, int num, 4281 struct map_lookup *map, int first, int num,
4153 int optimal, int dev_replace_is_ongoing) 4282 int optimal, int dev_replace_is_ongoing)
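Both new functions above, and most of the mapping code below, lean on nr_data_stripes()/nr_parity_stripes() from the new raid56.h, which is not shown in this excerpt. They presumably reduce to something like the following sketch (not the header's actual text):

static inline int nr_parity_stripes(struct map_lookup *map)
{
	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 1;
	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return 2;
	return 0;
}

static inline int nr_data_stripes(struct map_lookup *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

RAID5_P_STRIPE and RAID6_Q_STRIPE, used further down, are sentinel "logical addresses" at the very top of the u64 range, which is what lets sort_parity_stripes() push the P and Q stripes to the tail of the stripe array.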
@@ -4185,10 +4314,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
4185 return optimal; 4314 return optimal;
4186} 4315}
4187 4316
4317static inline int parity_smaller(u64 a, u64 b)
4318{
4319 return a > b;
4320}
4321
4322/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4323static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4324{
4325 struct btrfs_bio_stripe s;
4326 int i;
4327 u64 l;
4328 int again = 1;
4329
4330 while (again) {
4331 again = 0;
4332 for (i = 0; i < bbio->num_stripes - 1; i++) {
4333 if (parity_smaller(raid_map[i], raid_map[i+1])) {
4334 s = bbio->stripes[i];
4335 l = raid_map[i];
4336 bbio->stripes[i] = bbio->stripes[i+1];
4337 raid_map[i] = raid_map[i+1];
4338 bbio->stripes[i+1] = s;
4339 raid_map[i+1] = l;
4340 again = 1;
4341 }
4342 }
4343 }
4344}
4345
4188static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4346static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4189 u64 logical, u64 *length, 4347 u64 logical, u64 *length,
4190 struct btrfs_bio **bbio_ret, 4348 struct btrfs_bio **bbio_ret,
4191 int mirror_num) 4349 int mirror_num, u64 **raid_map_ret)
4192{ 4350{
4193 struct extent_map *em; 4351 struct extent_map *em;
4194 struct map_lookup *map; 4352 struct map_lookup *map;
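sort_parity_stripes() keeps raid_map[] and bbio->stripes[] in lock-step while bubble-sorting by logical address, and because the P/Q sentinels compare larger than any real address they always land last. A standalone illustration of the same parallel-array sort on plain arrays, with a placeholder sentinel value:

/* before: map = { P_SENTINEL, 0x100000, 0x110000 }, dev = { 0, 1, 2 }
 * after:  map = { 0x100000, 0x110000, P_SENTINEL }, dev = { 1, 2, 0 } */
static void sort_parallel(u64 *map, int *dev, int n)
{
	int i, again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < n - 1; i++) {
			if (map[i] > map[i + 1]) {
				u64 tmp_l = map[i];
				int tmp_d = dev[i];

				map[i] = map[i + 1];
				dev[i] = dev[i + 1];
				map[i + 1] = tmp_l;
				dev[i + 1] = tmp_d;
				again = 1;
			}
		}
	}
}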
@@ -4200,6 +4358,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4200 u64 stripe_nr; 4358 u64 stripe_nr;
4201 u64 stripe_nr_orig; 4359 u64 stripe_nr_orig;
4202 u64 stripe_nr_end; 4360 u64 stripe_nr_end;
4361 u64 stripe_len;
4362 u64 *raid_map = NULL;
4203 int stripe_index; 4363 int stripe_index;
4204 int i; 4364 int i;
4205 int ret = 0; 4365 int ret = 0;
@@ -4211,6 +4371,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4211 int num_alloc_stripes; 4371 int num_alloc_stripes;
4212 int patch_the_first_stripe_for_dev_replace = 0; 4372 int patch_the_first_stripe_for_dev_replace = 0;
4213 u64 physical_to_patch_in_first_stripe = 0; 4373 u64 physical_to_patch_in_first_stripe = 0;
4374 u64 raid56_full_stripe_start = (u64)-1;
4214 4375
4215 read_lock(&em_tree->lock); 4376 read_lock(&em_tree->lock);
4216 em = lookup_extent_mapping(em_tree, logical, *length); 4377 em = lookup_extent_mapping(em_tree, logical, *length);
@@ -4227,29 +4388,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4227 map = (struct map_lookup *)em->bdev; 4388 map = (struct map_lookup *)em->bdev;
4228 offset = logical - em->start; 4389 offset = logical - em->start;
4229 4390
4391 if (mirror_num > map->num_stripes)
4392 mirror_num = 0;
4393
4394 stripe_len = map->stripe_len;
4230 stripe_nr = offset; 4395 stripe_nr = offset;
4231 /* 4396 /*
4232 * stripe_nr counts the total number of stripes we have to stride 4397 * stripe_nr counts the total number of stripes we have to stride
4233 * to get to this block 4398 * to get to this block
4234 */ 4399 */
4235 do_div(stripe_nr, map->stripe_len); 4400 do_div(stripe_nr, stripe_len);
4236 4401
4237 stripe_offset = stripe_nr * map->stripe_len; 4402 stripe_offset = stripe_nr * stripe_len;
4238 BUG_ON(offset < stripe_offset); 4403 BUG_ON(offset < stripe_offset);
4239 4404
4240 /* stripe_offset is the offset of this block in its stripe*/ 4405 /* stripe_offset is the offset of this block in its stripe*/
4241 stripe_offset = offset - stripe_offset; 4406 stripe_offset = offset - stripe_offset;
4242 4407
4243 if (rw & REQ_DISCARD) 4408 /* if we're here for raid56, we need to know the stripe aligned start */
4409 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4410 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4411 raid56_full_stripe_start = offset;
4412
4413 /* allow a write of a full stripe, but make sure we don't
4414 * allow straddling of stripes
4415 */
4416 do_div(raid56_full_stripe_start, full_stripe_len);
4417 raid56_full_stripe_start *= full_stripe_len;
4418 }
4419
4420 if (rw & REQ_DISCARD) {
4421 /* we don't discard raid56 yet */
4422 if (map->type &
4423 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4424 ret = -EOPNOTSUPP;
4425 goto out;
4426 }
4244 *length = min_t(u64, em->len - offset, *length); 4427 *length = min_t(u64, em->len - offset, *length);
4245 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 4428 } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
4246 /* we limit the length of each bio to what fits in a stripe */ 4429 u64 max_len;
4247 *length = min_t(u64, em->len - offset, 4430 /* For writes to RAID[56], allow a full stripeset across all disks.
4248 map->stripe_len - stripe_offset); 4431 For other RAID types and for RAID[56] reads, just allow a single
4432 stripe (on a single disk). */
4433 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
4434 (rw & REQ_WRITE)) {
4435 max_len = stripe_len * nr_data_stripes(map) -
4436 (offset - raid56_full_stripe_start);
4437 } else {
4438 /* we limit the length of each bio to what fits in a stripe */
4439 max_len = stripe_len - stripe_offset;
4440 }
4441 *length = min_t(u64, em->len - offset, max_len);
4249 } else { 4442 } else {
4250 *length = em->len - offset; 4443 *length = em->len - offset;
4251 } 4444 }
4252 4445
4446 /* This is for when we're called from btrfs_merge_bio_hook() and all
4447 it cares about is the length */
4253 if (!bbio_ret) 4448 if (!bbio_ret)
4254 goto out; 4449 goto out;
4255 4450
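For RAID5/6 the interesting unit is the full stripe: stripe_len bytes from every data stripe. The code above rounds the offset down to a full-stripe boundary and, for writes, lets the bio run to the end of that full stripe but no further. A worked example assuming the 64KiB stripe_len and a six-device RAID6 chunk (four data stripes):

/* Mirrors the do_div rounding above; the numbers in the comments are for
 * an offset of 300KiB into the chunk. */
static u64 full_stripe_start_example(u64 offset)
{
	u64 stripe_len = 64 * 1024;
	u64 full_stripe_len = stripe_len * 4;		/* nr_data_stripes(map) */
	u64 start = offset - offset % full_stripe_len;	/* 300KiB -> 256KiB */

	/* a write at 300KiB may cover at most
	 * full_stripe_len - (offset - start) = 212KiB
	 * before it would straddle the next full stripe */
	return start;
}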
@@ -4282,7 +4477,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4282 u64 physical_of_found = 0; 4477 u64 physical_of_found = 0;
4283 4478
4284 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 4479 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
4285 logical, &tmp_length, &tmp_bbio, 0); 4480 logical, &tmp_length, &tmp_bbio, 0, NULL);
4286 if (ret) { 4481 if (ret) {
4287 WARN_ON(tmp_bbio != NULL); 4482 WARN_ON(tmp_bbio != NULL);
4288 goto out; 4483 goto out;
@@ -4348,6 +4543,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4348 do_div(stripe_nr_end, map->stripe_len); 4543 do_div(stripe_nr_end, map->stripe_len);
4349 stripe_end_offset = stripe_nr_end * map->stripe_len - 4544 stripe_end_offset = stripe_nr_end * map->stripe_len -
4350 (offset + *length); 4545 (offset + *length);
4546
4351 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4547 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4352 if (rw & REQ_DISCARD) 4548 if (rw & REQ_DISCARD)
4353 num_stripes = min_t(u64, map->num_stripes, 4549 num_stripes = min_t(u64, map->num_stripes,
@@ -4398,6 +4594,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4398 dev_replace_is_ongoing); 4594 dev_replace_is_ongoing);
4399 mirror_num = stripe_index - old_stripe_index + 1; 4595 mirror_num = stripe_index - old_stripe_index + 1;
4400 } 4596 }
4597
4598 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4599 BTRFS_BLOCK_GROUP_RAID6)) {
4600 u64 tmp;
4601
4602 if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
4603 && raid_map_ret) {
4604 int i, rot;
4605
4606 /* push stripe_nr back to the start of the full stripe */
4607 stripe_nr = raid56_full_stripe_start;
4608 do_div(stripe_nr, stripe_len);
4609
4610 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4611
4612 /* RAID[56] write or recovery. Return all stripes */
4613 num_stripes = map->num_stripes;
4614 max_errors = nr_parity_stripes(map);
4615
4616 raid_map = kmalloc(sizeof(u64) * num_stripes,
4617 GFP_NOFS);
4618 if (!raid_map) {
4619 ret = -ENOMEM;
4620 goto out;
4621 }
4622
4623 /* Work out the disk rotation on this stripe-set */
4624 tmp = stripe_nr;
4625 rot = do_div(tmp, num_stripes);
4626
4627 /* Fill in the logical address of each stripe */
4628 tmp = stripe_nr * nr_data_stripes(map);
4629 for (i = 0; i < nr_data_stripes(map); i++)
4630 raid_map[(i+rot) % num_stripes] =
4631 em->start + (tmp + i) * map->stripe_len;
4632
4633 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
4634 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4635 raid_map[(i+rot+1) % num_stripes] =
4636 RAID6_Q_STRIPE;
4637
4638 *length = map->stripe_len;
4639 stripe_index = 0;
4640 stripe_offset = 0;
4641 } else {
4642 /*
4643 * Mirror #0 or #1 means the original data block.
4644 * Mirror #2 is RAID5 parity block.
4645 * Mirror #3 is RAID6 Q block.
4646 */
4647 stripe_index = do_div(stripe_nr, nr_data_stripes(map));
4648 if (mirror_num > 1)
4649 stripe_index = nr_data_stripes(map) +
4650 mirror_num - 2;
4651
4652 /* We distribute the parity blocks across stripes */
4653 tmp = stripe_nr + stripe_index;
4654 stripe_index = do_div(tmp, map->num_stripes);
4655 }
4401 } else { 4656 } else {
4402 /* 4657 /*
4403 * after this do_div call, stripe_nr is the number of stripes 4658 * after this do_div call, stripe_nr is the number of stripes
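For a RAID5/6 write or rebuild the branch above returns every stripe of the full stripe and records in raid_map[] which logical address, or P/Q role, each one carries; rot rotates the parity position from one full stripe to the next. A standalone sketch of the same rotation for a three-device RAID5 chunk with 64KiB stripes (chunk_start and the sentinel are placeholders):

#define P_SENTINEL ((u64)-2)	/* stands in for RAID5_P_STRIPE */

static void fill_raid5_map(u64 chunk_start, u64 full_stripe_nr, u64 map[3])
{
	const int num_stripes = 3, data_stripes = 2;
	const u64 stripe_len = 64 * 1024;
	int rot = full_stripe_nr % num_stripes;	/* parity moves every full stripe */
	int i;

	for (i = 0; i < data_stripes; i++)
		map[(i + rot) % num_stripes] = chunk_start +
			(full_stripe_nr * data_stripes + i) * stripe_len;
	map[(i + rot) % num_stripes] = P_SENTINEL;
	/* full_stripe_nr = 1 gives:
	 * map = { P_SENTINEL, chunk_start + 128K, chunk_start + 192K },
	 * i.e. parity sits on device 0 for this full stripe. */
}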
@@ -4506,8 +4761,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4506 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 4761 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
4507 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 4762 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4508 BTRFS_BLOCK_GROUP_RAID10 | 4763 BTRFS_BLOCK_GROUP_RAID10 |
4764 BTRFS_BLOCK_GROUP_RAID5 |
4509 BTRFS_BLOCK_GROUP_DUP)) { 4765 BTRFS_BLOCK_GROUP_DUP)) {
4510 max_errors = 1; 4766 max_errors = 1;
4767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4768 max_errors = 2;
4511 } 4769 }
4512 } 4770 }
4513 4771
@@ -4608,6 +4866,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4608 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 4866 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
4609 bbio->mirror_num = map->num_stripes + 1; 4867 bbio->mirror_num = map->num_stripes + 1;
4610 } 4868 }
4869 if (raid_map) {
4870 sort_parity_stripes(bbio, raid_map);
4871 *raid_map_ret = raid_map;
4872 }
4611out: 4873out:
4612 if (dev_replace_is_ongoing) 4874 if (dev_replace_is_ongoing)
4613 btrfs_dev_replace_unlock(dev_replace); 4875 btrfs_dev_replace_unlock(dev_replace);
@@ -4620,7 +4882,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4620 struct btrfs_bio **bbio_ret, int mirror_num) 4882 struct btrfs_bio **bbio_ret, int mirror_num)
4621{ 4883{
4622 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 4884 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
4623 mirror_num); 4885 mirror_num, NULL);
4624} 4886}
4625 4887
4626int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 4888int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -4634,6 +4896,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4634 u64 bytenr; 4896 u64 bytenr;
4635 u64 length; 4897 u64 length;
4636 u64 stripe_nr; 4898 u64 stripe_nr;
4899 u64 rmap_len;
4637 int i, j, nr = 0; 4900 int i, j, nr = 0;
4638 4901
4639 read_lock(&em_tree->lock); 4902 read_lock(&em_tree->lock);
@@ -4644,10 +4907,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4644 map = (struct map_lookup *)em->bdev; 4907 map = (struct map_lookup *)em->bdev;
4645 4908
4646 length = em->len; 4909 length = em->len;
4910 rmap_len = map->stripe_len;
4911
4647 if (map->type & BTRFS_BLOCK_GROUP_RAID10) 4912 if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4648 do_div(length, map->num_stripes / map->sub_stripes); 4913 do_div(length, map->num_stripes / map->sub_stripes);
4649 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 4914 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
4650 do_div(length, map->num_stripes); 4915 do_div(length, map->num_stripes);
4916 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
4917 BTRFS_BLOCK_GROUP_RAID6)) {
4918 do_div(length, nr_data_stripes(map));
4919 rmap_len = map->stripe_len * nr_data_stripes(map);
4920 }
4651 4921
4652 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); 4922 buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
4653 BUG_ON(!buf); /* -ENOMEM */ 4923 BUG_ON(!buf); /* -ENOMEM */
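btrfs_rmap_block() reverses a physical range back to logical addresses. For RAID5/6 each device holds length / nr_data_stripes bytes of the chunk, and consecutive reverse-mapped addresses are a whole data-stripe width apart, which is what rmap_len now carries instead of map->stripe_len. A tiny illustration of that granularity, assuming the 64KiB stripe element:

/* Reverse-map step for a RAID6 map with 64KiB stripes. */
static u64 rmap_step_example(int num_stripes)
{
	u64 stripe_len = 64 * 1024;
	int data_stripes = num_stripes - 2;	/* RAID6 */

	return stripe_len * data_stripes;	/* 256KiB for six devices */
}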
@@ -4667,8 +4937,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4667 do_div(stripe_nr, map->sub_stripes); 4937 do_div(stripe_nr, map->sub_stripes);
4668 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 4938 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
4669 stripe_nr = stripe_nr * map->num_stripes + i; 4939 stripe_nr = stripe_nr * map->num_stripes + i;
4670 } 4940 } /* else if RAID[56], multiply by nr_data_stripes().
4671 bytenr = chunk_start + stripe_nr * map->stripe_len; 4941 * Alternatively, just use rmap_len below instead of
4942 * map->stripe_len */
4943
4944 bytenr = chunk_start + stripe_nr * rmap_len;
4672 WARN_ON(nr >= map->num_stripes); 4945 WARN_ON(nr >= map->num_stripes);
4673 for (j = 0; j < nr; j++) { 4946 for (j = 0; j < nr; j++) {
4674 if (buf[j] == bytenr) 4947 if (buf[j] == bytenr)
@@ -4682,7 +4955,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
4682 4955
4683 *logical = buf; 4956 *logical = buf;
4684 *naddrs = nr; 4957 *naddrs = nr;
4685 *stripe_len = map->stripe_len; 4958 *stripe_len = rmap_len;
4686 4959
4687 free_extent_map(em); 4960 free_extent_map(em);
4688 return 0; 4961 return 0;
@@ -4756,7 +5029,7 @@ static void btrfs_end_bio(struct bio *bio, int err)
4756 bio->bi_bdev = (struct block_device *) 5029 bio->bi_bdev = (struct block_device *)
4757 (unsigned long)bbio->mirror_num; 5030 (unsigned long)bbio->mirror_num;
4758 /* only send an error to the higher layers if it is 5031 /* only send an error to the higher layers if it is
4759 * beyond the tolerance of the multi-bio 5032 * beyond the tolerance of the btrfs bio
4760 */ 5033 */
4761 if (atomic_read(&bbio->error) > bbio->max_errors) { 5034 if (atomic_read(&bbio->error) > bbio->max_errors) {
4762 err = -EIO; 5035 err = -EIO;
@@ -4790,13 +5063,18 @@ struct async_sched {
4790 * This will add one bio to the pending list for a device and make sure 5063 * This will add one bio to the pending list for a device and make sure
4791 * the work struct is scheduled. 5064 * the work struct is scheduled.
4792 */ 5065 */
4793static noinline void schedule_bio(struct btrfs_root *root, 5066noinline void btrfs_schedule_bio(struct btrfs_root *root,
4794 struct btrfs_device *device, 5067 struct btrfs_device *device,
4795 int rw, struct bio *bio) 5068 int rw, struct bio *bio)
4796{ 5069{
4797 int should_queue = 1; 5070 int should_queue = 1;
4798 struct btrfs_pending_bios *pending_bios; 5071 struct btrfs_pending_bios *pending_bios;
4799 5072
5073 if (device->missing || !device->bdev) {
5074 bio_endio(bio, -EIO);
5075 return;
5076 }
5077
4800 /* don't bother with additional async steps for reads, right now */ 5078 /* don't bother with additional async steps for reads, right now */
4801 if (!(rw & REQ_WRITE)) { 5079 if (!(rw & REQ_WRITE)) {
4802 bio_get(bio); 5080 bio_get(bio);
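btrfs_schedule_bio() is exported for the raid56 code, so it grows a guard for missing devices: the bio is completed with -EIO instead of being queued against a NULL bdev, letting the RAID5/6 paths fall back to parity reconstruction. A minimal sketch of a caller relying on that failure mode (the wrapper name is made up, and it assumes device->dev_root is populated):

static void submit_or_fail(struct btrfs_device *device, int rw, struct bio *bio)
{
	if (!device || device->missing || !device->bdev) {
		bio_endio(bio, -EIO);	/* complete with an error right away */
		return;
	}
	btrfs_schedule_bio(device->dev_root, device, rw, bio);
}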
@@ -4894,7 +5172,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
4894#endif 5172#endif
4895 bio->bi_bdev = dev->bdev; 5173 bio->bi_bdev = dev->bdev;
4896 if (async) 5174 if (async)
4897 schedule_bio(root, dev, rw, bio); 5175 btrfs_schedule_bio(root, dev, rw, bio);
4898 else 5176 else
4899 btrfsic_submit_bio(rw, bio); 5177 btrfsic_submit_bio(rw, bio);
4900} 5178}
@@ -4953,6 +5231,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4953 u64 logical = (u64)bio->bi_sector << 9; 5231 u64 logical = (u64)bio->bi_sector << 9;
4954 u64 length = 0; 5232 u64 length = 0;
4955 u64 map_length; 5233 u64 map_length;
5234 u64 *raid_map = NULL;
4956 int ret; 5235 int ret;
4957 int dev_nr = 0; 5236 int dev_nr = 0;
4958 int total_devs = 1; 5237 int total_devs = 1;
@@ -4961,12 +5240,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4961 length = bio->bi_size; 5240 length = bio->bi_size;
4962 map_length = length; 5241 map_length = length;
4963 5242
4964 ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5243 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
4965 mirror_num); 5244 mirror_num, &raid_map);
4966 if (ret) 5245 if (ret) /* -ENOMEM */
4967 return ret; 5246 return ret;
4968 5247
4969 total_devs = bbio->num_stripes; 5248 total_devs = bbio->num_stripes;
5249 bbio->orig_bio = first_bio;
5250 bbio->private = first_bio->bi_private;
5251 bbio->end_io = first_bio->bi_end_io;
5252 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5253
5254 if (raid_map) {
5255 /* In this case, map_length has been set to the length of
5256 a single stripe; not the whole write */
5257 if (rw & WRITE) {
5258 return raid56_parity_write(root, bio, bbio,
5259 raid_map, map_length);
5260 } else {
5261 return raid56_parity_recover(root, bio, bbio,
5262 raid_map, map_length,
5263 mirror_num);
5264 }
5265 }
5266
4970 if (map_length < length) { 5267 if (map_length < length) {
4971 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " 5268 printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu "
4972 "len %llu\n", (unsigned long long)logical, 5269 "len %llu\n", (unsigned long long)logical,
@@ -4975,11 +5272,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4975 BUG(); 5272 BUG();
4976 } 5273 }
4977 5274
4978 bbio->orig_bio = first_bio;
4979 bbio->private = first_bio->bi_private;
4980 bbio->end_io = first_bio->bi_end_io;
4981 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4982
4983 while (dev_nr < total_devs) { 5275 while (dev_nr < total_devs) {
4984 dev = bbio->stripes[dev_nr].dev; 5276 dev = bbio->stripes[dev_nr].dev;
4985 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5277 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 12bb84166a5f..062d8604d35b 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 321void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
322 struct btrfs_device *tgtdev); 322 struct btrfs_device *tgtdev);
323int btrfs_scratch_superblock(struct btrfs_device *device); 323int btrfs_scratch_superblock(struct btrfs_device *device);
324 324void btrfs_schedule_bio(struct btrfs_root *root,
325 struct btrfs_device *device,
326 int rw, struct bio *bio);
327int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
328 u64 logical, u64 len, int mirror_num);
329unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
330 struct btrfs_mapping_tree *map_tree,
331 u64 logical);
325static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 332static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
326 int index) 333 int index)
327{ 334{