| author | Chris Mason <chris.mason@fusionio.com> | 2013-02-20 14:06:05 -0500 |
|---|---|---|
| committer | Chris Mason <chris.mason@fusionio.com> | 2013-02-20 14:06:05 -0500 |
| commit | e942f883bc6651d50be139477baf6fb0eed3d5bb (patch) | |
| tree | e1d19783e9c8b42198a69c17c9719fb90f302847 /fs/btrfs | |
| parent | b2c6b3e0611c58fbeb6b9c0892b6249f7bdfaf6b (diff) | |
| parent | 0e4e02636611dbf89a2f36320a32054f9936d6cb (diff) | |
Merge branch 'raid56-experimental' into for-linus-3.9
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Conflicts:
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/inode.c
fs/btrfs/volumes.c
Diffstat (limited to 'fs/btrfs')
```
 -rw-r--r--  fs/btrfs/Kconfig            |    3
 -rw-r--r--  fs/btrfs/Makefile           |    2
 -rw-r--r--  fs/btrfs/compression.c      |    4
 -rw-r--r--  fs/btrfs/ctree.h            |   44
 -rw-r--r--  fs/btrfs/delayed-ref.h      |    9
 -rw-r--r--  fs/btrfs/disk-io.c          |   62
 -rw-r--r--  fs/btrfs/disk-io.h          |    7
 -rw-r--r--  fs/btrfs/extent-tree.c      |  156
 -rw-r--r--  fs/btrfs/extent_io.c        |   40
 -rw-r--r--  fs/btrfs/extent_io.h        |    2
 -rw-r--r--  fs/btrfs/free-space-cache.c |   50
 -rw-r--r--  fs/btrfs/inode.c            |   18
 -rw-r--r--  fs/btrfs/raid56.c           | 2080
 -rw-r--r--  fs/btrfs/raid56.h           |   51
 -rw-r--r--  fs/btrfs/scrub.c            |    8
 -rw-r--r--  fs/btrfs/transaction.c      |    9
 -rw-r--r--  fs/btrfs/volumes.c          |  380
 -rw-r--r--  fs/btrfs/volumes.h          |    9

 18 files changed, 2814 insertions(+), 120 deletions(-)
```
```diff
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..5f583c8a36d0 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -6,6 +6,9 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
```
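The two new selects pull in the kernel's RAID6 syndrome library (lib/raid6) and XOR helpers that raid56.c builds on. For intuition, P parity is simply the XOR of the data stripes; here is a minimal userspace sketch of that part (Q parity needs the Galois-field math RAID6_PQ provides and is not shown):

```c
#include <stddef.h>
#include <stdint.h>

/* P = D0 ^ D1 ^ ... ^ Dn-1, computed bytewise over one stripe */
static void xor_parity(uint8_t *p, uint8_t *const *data,
		       size_t ndata, size_t len)
{
	size_t i, j;

	for (i = 0; i < len; i++) {
		uint8_t x = 0;

		for (j = 0; j < ndata; j++)
			x ^= data[j][i];
		p[i] = x;
	}
}
```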
```diff
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
```
```diff
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
```
```diff
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1679051f4d39..3dcedfe4f759 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -506,6 +506,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
 
 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -515,6 +516,7 @@ struct btrfs_super_block {
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
 	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
 	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
 	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
 
 /*
@@ -956,6 +958,8 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
 
 enum btrfs_raid_types {
@@ -964,6 +968,8 @@ enum btrfs_raid_types {
 	BTRFS_RAID_DUP,
 	BTRFS_RAID_RAID0,
 	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
 	BTRFS_NR_RAID_TYPES
 };
 
@@ -973,6 +979,8 @@ enum btrfs_raid_types {
 
 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
					 BTRFS_BLOCK_GROUP_DUP |     \
					 BTRFS_BLOCK_GROUP_RAID10)
 /*
@@ -1197,6 +1205,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1242,6 +1254,23 @@ enum btrfs_orphan_cleanup_state {
 	ORPHAN_CLEANUP_DONE	= 2,
 };
 
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
@@ -1341,6 +1370,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1423,6 +1459,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -3490,9 +3528,9 @@ int btrfs_writepages(struct address_space *mapping,
 		      struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
```
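The two structures added above give every full stripe a hash bucket that serializes read/modify/write cycles, with 2^11 buckets allocated as one flexible-array block. As a hedged userspace sketch of how such a table could be sized and indexed (the multiplicative hash below is an assumption for illustration; the kernel code that actually picks the bucket lives in raid56.c):

```c
#include <stdint.h>
#include <stdlib.h>

#define STRIPE_HASH_TABLE_BITS 11	/* matches BTRFS_STRIPE_HASH_TABLE_BITS */
#define NUM_BUCKETS (1UL << STRIPE_HASH_TABLE_BITS)

struct stripe_hash {
	uint64_t dummy;	/* the kernel bucket holds a list head, waitqueue and lock */
};

struct stripe_hash_table {
	size_t size;
	struct stripe_hash table[];	/* flexible array member, as in the diff */
};

/* one allocation covers the header plus all 2048 buckets */
static struct stripe_hash_table *alloc_table(void)
{
	struct stripe_hash_table *t;

	t = calloc(1, sizeof(*t) + NUM_BUCKETS * sizeof(t->table[0]));
	if (t)
		t->size = NUM_BUCKETS;
	return t;
}

/* assumed hash, for illustration only */
static size_t bucket_for(uint64_t full_stripe_start)
{
	return (size_t)((full_stripe_start * 0x9e3779b97f4a7c15ULL) >>
			(64 - STRIPE_HASH_TABLE_BITS));
}
```

Declaring `table[]` as a flexible array member is what lets the header and all buckets live in a single contiguous allocation.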
```diff
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 7939149f8f27..f75fcaf79aeb 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root {
 	unsigned long num_heads_ready;
 
 	/*
+	 * bumped when someone is making progress on the delayed
+	 * refs, so that other procs know they are just adding to
+	 * contention instead of helping
+	 */
+	atomic_t procs_running_refs;
+	atomic_t ref_seq;
+	wait_queue_head_t wait;
+
+	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
 	 * to be run right away
```
```diff
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 779b401cd952..eb7c14308521 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "dev-replace.h"
+#include "raid56.h"
 
 #ifdef CONFIG_X86
 #include <asm/cpufeature.h>
@@ -640,8 +641,15 @@ err:
 		btree_readahead_hook(root, eb, eb->start, ret);
 	}
 
-	if (ret)
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
 		clear_extent_buffer_uptodate(eb);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
@@ -655,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
 	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
@@ -671,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata == 1)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
-		else if (end_io_wq->metadata == 2)
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			btrfs_queue_worker(&fs_info->endio_freespace_worker,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
+		else if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
 					   &end_io_wq->work);
 		else
@@ -696,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
  * 0 - if data
  * 1 - if normal metadata
  * 2 - if writing to the free space cache area
+ * 3 - raid parity work
  */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
@@ -2179,6 +2195,12 @@ int open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = -ENOMEM;
+		goto fail_alloc;
+	}
+
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
@@ -2349,6 +2371,12 @@ int open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
 			   "endio-meta-write", fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_raid56_workers,
+			   "endio-raid56", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->rmw_workers,
+			   "rmw", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
@@ -2367,6 +2395,8 @@ int open_ctree(struct super_block *sb,
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
+	fs_info->endio_raid56_workers.idle_thresh = 4;
+	fs_info->rmw_workers.idle_thresh = 2;
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2383,6 +2413,8 @@ int open_ctree(struct super_block *sb,
 	ret |= btrfs_start_workers(&fs_info->fixup_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->rmw_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2726,6 +2758,8 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2747,6 +2781,7 @@ fail_bdi:
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 	return err;
 
@@ -3094,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
 			    ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
 			      == 0)))
 				num_tolerated_disk_barrier_failures = 0;
-			else if (num_tolerated_disk_barrier_failures > 1
-				 &&
-				 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-					   BTRFS_BLOCK_GROUP_RAID10)))
-				num_tolerated_disk_barrier_failures = 1;
+			else if (num_tolerated_disk_barrier_failures > 1) {
+				if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+				    BTRFS_BLOCK_GROUP_RAID5 |
+				    BTRFS_BLOCK_GROUP_RAID10)) {
+					num_tolerated_disk_barrier_failures = 1;
+				} else if (flags &
+					   BTRFS_BLOCK_GROUP_RAID6) {
+					num_tolerated_disk_barrier_failures = 2;
+				}
+			}
 		}
 	}
 	up_read(&sinfo->groups_sem);
@@ -3402,6 +3442,8 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3424,6 +3466,8 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
+	btrfs_free_stripe_hash_table(fs_info);
+
 	return 0;
 }
 
```
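The rewritten branch in btrfs_calc_num_tolerated_disk_barrier_failures() encodes how many whole-device losses each profile can absorb: one for RAID1, RAID5 and RAID10, two for RAID6. A minimal plain-C sketch of that mapping, using the flag values from the ctree.h hunk above (the real function additionally takes the minimum across all block groups, which this sketch omits):

```c
#include <stdint.h>

/* flag values from the ctree.h hunk above */
#define BG_RAID1  (1ULL << 4)
#define BG_RAID10 (1ULL << 6)
#define BG_RAID5  (1ULL << 7)
#define BG_RAID6  (1ULL << 8)

/* whole-device failures a single profile can absorb */
static int tolerated_failures(uint64_t flags)
{
	if (flags & BG_RAID6)
		return 2;	/* two parity devices per stripe */
	if (flags & (BG_RAID1 | BG_RAID5 | BG_RAID10))
		return 1;	/* one mirror or one parity device */
	return 0;		/* single, raid0, dup on one disk */
}
```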
```diff
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 305c33efb0e3..034d7dc552b2 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,6 +25,13 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
+enum {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = 16 * 1024;
```
```diff
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5cd44e239595..b3ecca447ddf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 		*actual_bytes = discarded_bytes;
 
 
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
 	return ret;
 }
 
@@ -2440,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+		      int count)
+{
+	int val = atomic_read(&delayed_refs->ref_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2474,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
+	if (count == 0) {
+		count = delayed_refs->num_entries * 2;
+		run_most = 1;
+	}
+
+	if (!run_all && !run_most) {
+		int old;
+		int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+		if (old) {
+			DEFINE_WAIT(__wait);
+			if (delayed_refs->num_entries < 16348)
+				return 0;
+
+			prepare_to_wait(&delayed_refs->wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+
+			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+			if (old) {
+				schedule();
+				finish_wait(&delayed_refs->wait, &__wait);
+
+				if (!refs_newer(delayed_refs, seq, 256))
+					goto progress;
+				else
+					return 0;
+			} else {
+				finish_wait(&delayed_refs->wait, &__wait);
+				goto again;
+			}
+		}
+
+	} else {
+		atomic_inc(&delayed_refs->procs_running_refs);
+	}
+
 again:
 	loops = 0;
 	spin_lock(&delayed_refs->lock);
@@ -2482,10 +2533,6 @@ again:
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 
-	if (count == 0) {
-		count = delayed_refs->num_entries * 2;
-		run_most = 1;
-	}
 	while (1) {
 		if (!(run_all || run_most) &&
 		    delayed_refs->num_heads_ready < 64)
@@ -2508,9 +2555,12 @@ again:
 			btrfs_release_ref_cluster(&cluster);
 			spin_unlock(&delayed_refs->lock);
 			btrfs_abort_transaction(trans, root, ret);
+			atomic_dec(&delayed_refs->procs_running_refs);
 			return ret;
 		}
 
+		atomic_add(ret, &delayed_refs->ref_seq);
+
 		count -= min_t(unsigned long, ret, count);
 
 		if (count == 0)
@@ -2579,6 +2629,11 @@ again:
 		goto again;
 	}
 out:
+	atomic_dec(&delayed_refs->procs_running_refs);
+	smp_mb();
+	if (waitqueue_active(&delayed_refs->wait))
+		wake_up(&delayed_refs->wait);
+
 	spin_unlock(&delayed_refs->lock);
 	assert_qgroups_uptodate(trans);
 	return 0;
```
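The machinery above lets exactly one task act as the delayed-ref runner while others either bail out early (small backlog) or sleep until ref_seq shows enough progress happened. A userspace sketch of the same gating with C11 atomics (no kernel waitqueue; and where the kernel bumps ref_seq per processed cluster, this sketch folds it into the finish step):

```c
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int procs_running_refs;	/* 1 while somebody runs refs */
static atomic_int ref_seq;		/* total refs processed so far */

/* mirrors refs_newer() from the hunk above */
static bool refs_newer(int seq, int count)
{
	int val = atomic_load(&ref_seq);

	return val < seq || val >= seq + count;
}

/* same idea as atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1) */
static bool try_become_runner(void)
{
	int expected = 0;

	return atomic_compare_exchange_strong(&procs_running_refs,
					      &expected, 1);
}

/* the kernel wake_up()s delayed_refs->wait after this decrement */
static void finish_running(int processed)
{
	atomic_fetch_add(&ref_seq, processed);
	atomic_store(&procs_running_refs, 0);
}
```

The extent-tree.c diff continues below.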
```diff
@@ -3284,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 	u64 target;
+	u64 tmp;
 
 	/*
 	 * see if restripe for this chunk_type is in progress, if so
@@ -3300,30 +3356,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	}
 	spin_unlock(&root->fs_info->balance_lock);
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP))) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	}
-
-	return extended_to_chunk(flags);
+	return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3347,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
+	u64 ret;
 
 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3355,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	return get_alloc_profile(root, flags);
+	ret = get_alloc_profile(root, flags);
+	return ret;
 }
 
 /*
```
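btrfs_reduce_alloc_profile() now collapses a candidate mask to the single strongest profile the device count can support, with RAID6 outranking RAID5, then RAID10, RAID1 and RAID0. A compilable sketch of exactly that selection, using the flag values from the ctree.h hunk (extended_to_chunk() is omitted):

```c
#include <stdint.h>
#include <stdio.h>

/* flag values from the ctree.h hunk above */
#define BG_RAID0  (1ULL << 3)
#define BG_RAID1  (1ULL << 4)
#define BG_DUP    (1ULL << 5)
#define BG_RAID10 (1ULL << 6)
#define BG_RAID5  (1ULL << 7)
#define BG_RAID6  (1ULL << 8)

static uint64_t reduce_profile(uint64_t flags, uint64_t num_devices)
{
	uint64_t tmp;

	/* first, mask out the RAID levels which aren't possible */
	if (num_devices == 1)
		flags &= ~(BG_RAID1 | BG_RAID0 | BG_RAID5);
	if (num_devices < 3)
		flags &= ~BG_RAID6;
	if (num_devices < 4)
		flags &= ~BG_RAID10;

	/* then keep only the strongest remaining profile bit */
	tmp = flags & (BG_DUP | BG_RAID0 | BG_RAID1 |
		       BG_RAID5 | BG_RAID6 | BG_RAID10);
	flags &= ~tmp;

	if (tmp & BG_RAID6)
		tmp = BG_RAID6;
	else if (tmp & BG_RAID5)
		tmp = BG_RAID5;
	else if (tmp & BG_RAID10)
		tmp = BG_RAID10;
	else if (tmp & BG_RAID1)
		tmp = BG_RAID1;
	else if (tmp & BG_RAID0)
		tmp = BG_RAID0;

	return flags | tmp;
}

int main(void)
{
	/* raid6 outranks raid1 while 3+ devices remain: prints 0x100 */
	printf("%#llx\n", (unsigned long long)
	       reduce_profile(BG_RAID1 | BG_RAID6, 4));
	return 0;
}
```

The remaining extent-tree.c hunks follow.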
```diff
@@ -3530,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
 	u64 num_dev;
 
-	if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-	    type & BTRFS_BLOCK_GROUP_RAID0)
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
 		num_dev = root->fs_info->fs_devices->rw_devices;
 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 		num_dev = 2;
@@ -3706,7 +3768,9 @@ static int can_overcommit(struct btrfs_root *root,
 
 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
-	 * space is actually useable.
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
 	 */
 	if (profile & (BTRFS_BLOCK_GROUP_DUP |
 		       BTRFS_BLOCK_GROUP_RAID1 |
@@ -5539,10 +5603,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+			struct btrfs_block_group_cache *cache,
+			u64 val, u64 num_bytes)
 {
-	u64 mask = ((u64)root->stripesize - 1);
-	u64 ret = (val + mask) & ~mask;
+	u64 mask;
+	u64 ret;
+	mask = ((u64)root->stripesize - 1);
+	ret = (val + mask) & ~mask;
 	return ret;
 }
 
@@ -5599,8 +5667,12 @@ int __get_raid_index(u64 flags)
 		return BTRFS_RAID_DUP;
 	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
 		return BTRFS_RAID_RAID0;
-	else
-		return BTRFS_RAID_SINGLE;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		return BTRFS_RAID_RAID5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		return BTRFS_RAID_RAID6;
+
+	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5743,6 +5815,8 @@ search:
 		if (!block_group_bits(block_group, data)) {
 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
				BTRFS_BLOCK_GROUP_RAID10;
 
 			/*
@@ -5771,6 +5845,7 @@ have_block_group:
 			 * lets look there
 			 */
 			if (last_ptr) {
+				unsigned long aligned_cluster;
 				/*
 				 * the refill lock keeps out other
 				 * people trying to start a new cluster
@@ -5837,11 +5912,15 @@ refill_cluster:
 					goto unclustered_alloc;
 				}
 
+				aligned_cluster = max_t(unsigned long,
+						empty_cluster + empty_size,
+						block_group->full_stripe_len);
+
 				/* allocate a cluster in this block group */
 				ret = btrfs_find_space_cluster(trans, root,
 						       block_group, last_ptr,
						       search_start, num_bytes,
-						       empty_cluster + empty_size);
+						       aligned_cluster);
 				if (ret == 0) {
 					/*
 					 * now pull our allocation out of this
@@ -5912,7 +5991,8 @@ unclustered_alloc:
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, offset);
+		search_start = stripe_align(root, used_block_group,
+					    offset, num_bytes);
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
@@ -7284,6 +7364,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 		root->fs_info->fs_devices->missing_devices;
 
 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
 	if (num_devices == 1) {
@@ -7837,7 +7918,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		btrfs_release_path(path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
-
+		cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       found_key.objectid);
 		btrfs_init_free_space_ctl(cache);
 
 		/*
@@ -7891,6 +7974,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		if (!(get_alloc_profile(root, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
@@ -7966,6 +8051,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
 	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       chunk_offset);
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
```
```diff
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5c00d6aeae75..66f999b97cbb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1895,13 +1895,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 	if (ret)
 		err = ret;
 
-	if (did_repair) {
-		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-					rec->start + rec->len - 1,
-					EXTENT_DAMAGED, GFP_NOFS);
-		if (ret && !err)
-			err = ret;
-	}
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;
 
 	kfree(rec);
 	return err;
@@ -1932,10 +1930,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;
 
 	BUG_ON(!mirror_num);
 
+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
 	bio = bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
@@ -2052,6 +2055,7 @@ static int clean_io_failure(u64 start, struct page *page)
 					    failrec->failed_mirror);
 		did_repair = !ret;
 	}
+	ret = 0;
 
 out:
@@ -2487,13 +2491,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
 	return ret;
 }
 
-static int merge_bio(struct extent_io_tree *tree, struct page *page,
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 		     unsigned long offset, size_t size, struct bio *bio,
 		     unsigned long bio_flags)
 {
 	int ret = 0;
 	if (tree->ops && tree->ops->merge_bio_hook)
-		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
 						bio_flags);
 	BUG_ON(ret < 0);
 	return ret;
@@ -2528,7 +2532,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			sector;
 
 		if (prev_bio_flags != bio_flags || !contig ||
-		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
+		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
 			ret = submit_one_bio(rw, bio, mirror_num,
 					     prev_bio_flags);
@@ -4162,6 +4166,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
+	int refs;
 	/* the ref bit is tricky.  We have to make sure it is set
 	 * if we have the buffer dirty.   Otherwise the
 	 * code to free a buffer can end up dropping a dirty
@@ -4182,6 +4187,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
 	 * So bump the ref count first, then set the bit.  If someone
 	 * beat us to it, drop the ref we added.
 	 */
+	refs = atomic_read(&eb->refs);
+	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		return;
+
 	spin_lock(&eb->refs_lock);
 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		atomic_inc(&eb->refs);
@@ -4383,9 +4392,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 
 void free_extent_buffer(struct extent_buffer *eb)
 {
+	int refs;
+	int old;
 	if (!eb)
 		return;
 
+	while (1) {
+		refs = atomic_read(&eb->refs);
+		if (refs <= 3)
+			break;
+		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+		if (old == refs)
+			return;
+	}
+
 	spin_lock(&eb->refs_lock);
 	if (atomic_read(&eb->refs) == 2 &&
 	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
```
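The new fast path in free_extent_buffer() drops a reference without taking refs_lock whenever more than three references remain; only the final drops, which may actually free the buffer, fall through to the locked slow path. A userspace sketch of that pattern with C11 atomics (an analog, not the kernel code):

```c
#include <stdatomic.h>
#include <stdbool.h>

/*
 * Returns true if the reference was dropped on the lockless fast
 * path; false means few enough refs remain that the caller must
 * take the locked slow path (which may free the object).
 */
static bool fast_drop_ref(atomic_int *refs)
{
	int cur;

	while (1) {
		cur = atomic_load(refs);
		if (cur <= 3)
			return false;
		/* cmpxchg: only decrement if nobody raced with us */
		if (atomic_compare_exchange_strong(refs, &cur, cur - 1))
			return true;
	}
}
```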
```diff
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ff182322d112..dc81868d975a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -72,7 +72,7 @@ struct extent_io_ops {
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
-	int (*merge_bio_hook)(struct page *page, unsigned long offset,
+	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio,
 			      unsigned long bio_flags);
 	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
```
```diff
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index c8090f18c217..1f84fc09c1a8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1465,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 }
 
 static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	u64 ctl_off;
+	u64 tmp;
+	u64 align_off;
 	int ret;
 
 	if (!ctl->free_space_offset.rb_node)
@@ -1483,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
 		if (entry->bytes < *bytes)
 			continue;
 
+		/* make sure the space returned is big enough
+		 * to match our requested alignment
+		 */
+		if (*bytes >= align) {
+			ctl_off = entry->offset - ctl->start;
+			tmp = ctl_off + align - 1;
+			do_div(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off)
+			continue;
+
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, offset, bytes);
-			if (!ret)
+			ret = search_bitmap(ctl, entry, &tmp, bytes);
+			if (!ret) {
+				*offset = tmp;
 				return entry;
+			}
 			continue;
 		}
 
-		*offset = entry->offset;
-		*bytes = entry->bytes;
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
 		return entry;
 	}
 
```
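The do_div() arithmetic in find_free_space() rounds an entry's offset up to the next align boundary measured from ctl->start, then checks that the entry still covers the request plus the skipped bytes. The same computation as a small compilable example (the values in main() are made up for illustration):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Round entry_offset up to the next multiple of align measured from
 * ctl_start, and return how many bytes the alignment skips (align_off
 * in the diff). do_div(tmp, align) is plain 64-bit division here.
 */
static uint64_t align_offset(uint64_t entry_offset, uint64_t ctl_start,
			     uint64_t align)
{
	uint64_t ctl_off = entry_offset - ctl_start;
	uint64_t tmp = (ctl_off + align - 1) / align;

	tmp = tmp * align + ctl_start;	/* first aligned offset >= entry */
	return tmp - entry_offset;	/* bytes lost to alignment */
}

int main(void)
{
	/* entry at byte 1000, ctl starts at 0, 512-byte stripes: skip 24 */
	assert(align_offset(1000, 0, 512) == 24);
	return 0;
}
```

The free-space-cache.c diff continues below.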
```diff
@@ -2101,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 	struct btrfs_free_space *entry = NULL;
 	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;
 
 	spin_lock(&ctl->tree_lock);
-	entry = find_free_space(ctl, &offset, &bytes_search);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len);
 	if (!entry)
 		goto out;
 
@@ -2113,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 		if (!entry->bytes)
 			free_bitmap(ctl, entry);
 	} else {
+
 		unlink_free_space(ctl, entry);
-		entry->offset += bytes;
-		entry->bytes -= bytes;
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
 		if (!entry->bytes)
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		else
@@ -2125,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 out:
 	spin_unlock(&ctl->tree_lock);
 
+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
 	return ret;
 }
 
```
```diff
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1aa98be54ce0..4e6a11c2cfdd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -40,6 +40,7 @@
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
 #include <linux/btrfs.h>
+#include <linux/blkdev.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -1605,7 +1606,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
  * we don't create bios that span stripes or chunks
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags)
 {
@@ -1620,7 +1621,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 
 	length = bio->bi_size;
 	map_length = length;
-	ret = btrfs_map_block(root->fs_info, READ, logical,
+	ret = btrfs_map_block(root->fs_info, rw, logical,
 			      &map_length, NULL, 0);
 	/* Will always return 0 with map_multi == NULL */
 	BUG_ON(ret < 0);
@@ -6464,19 +6465,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 	int async_submit = 0;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(root->fs_info, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
 		return -EIO;
 	}
-
 	if (map_length >= orig_bio->bi_size) {
 		bio = orig_bio;
 		goto submit;
 	}
 
-	async_submit = 1;
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
 	if (!bio)
 		return -ENOMEM;
@@ -6518,7 +6524,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(root->fs_info, READ,
+			ret = btrfs_map_block(root->fs_info, rw,
 					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
```
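The rw value threaded through merge_bio_hook() into btrfs_map_block() matters because reads and writes map differently on raid5/6: a write must stop at a full-stripe boundary so parity can be computed once for the whole stripe, while a read only has to stay within one device stripe. A deliberately simplified, hypothetical sketch of that kind of merge test (the helper and its span model are illustrative, not kernel code):

```c
#include <stdbool.h>
#include <stdint.h>

/*
 * Illustrative only: a bio may keep merging pages while it stays
 * inside one contiguous mapped span. For raid5/6 writes the span is
 * a full stripe; for reads it is a single device stripe, which is
 * why the hook now needs rw before asking for the mapping.
 */
static bool can_merge(uint64_t bio_start, uint64_t bio_len,
		      uint64_t add_len, uint64_t span)
{
	uint64_t span_end = (bio_start / span + 1) * span;

	return bio_start + bio_len + add_len <= span_end;
}
```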
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..e34e568534d9 --- /dev/null +++ b/fs/btrfs/raid56.c | |||
@@ -0,0 +1,2080 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public | ||
7 | * License v2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public | ||
15 | * License along with this program; if not, write to the | ||
16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
17 | * Boston, MA 02111-1307, USA. | ||
18 | */ | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/wait.h> | ||
21 | #include <linux/bio.h> | ||
22 | #include <linux/slab.h> | ||
23 | #include <linux/buffer_head.h> | ||
24 | #include <linux/blkdev.h> | ||
25 | #include <linux/random.h> | ||
26 | #include <linux/iocontext.h> | ||
27 | #include <linux/capability.h> | ||
28 | #include <linux/ratelimit.h> | ||
29 | #include <linux/kthread.h> | ||
30 | #include <linux/raid/pq.h> | ||
31 | #include <linux/hash.h> | ||
32 | #include <linux/list_sort.h> | ||
33 | #include <linux/raid/xor.h> | ||
34 | #include <asm/div64.h> | ||
35 | #include "compat.h" | ||
36 | #include "ctree.h" | ||
37 | #include "extent_map.h" | ||
38 | #include "disk-io.h" | ||
39 | #include "transaction.h" | ||
40 | #include "print-tree.h" | ||
41 | #include "volumes.h" | ||
42 | #include "raid56.h" | ||
43 | #include "async-thread.h" | ||
44 | #include "check-integrity.h" | ||
45 | #include "rcu-string.h" | ||
46 | |||
47 | /* set when additional merges to this rbio are not allowed */ | ||
48 | #define RBIO_RMW_LOCKED_BIT 1 | ||
49 | |||
50 | /* | ||
51 | * set when this rbio is sitting in the hash, but it is just a cache | ||
52 | * of past RMW | ||
53 | */ | ||
54 | #define RBIO_CACHE_BIT 2 | ||
55 | |||
56 | /* | ||
57 | * set when it is safe to trust the stripe_pages for caching | ||
58 | */ | ||
59 | #define RBIO_CACHE_READY_BIT 3 | ||
60 | |||
61 | |||
62 | #define RBIO_CACHE_SIZE 1024 | ||
63 | |||
64 | struct btrfs_raid_bio { | ||
65 | struct btrfs_fs_info *fs_info; | ||
66 | struct btrfs_bio *bbio; | ||
67 | |||
68 | /* | ||
69 | * logical block numbers for the start of each stripe | ||
70 | * The last one or two are p/q. These are sorted, | ||
71 | * so raid_map[0] is the start of our full stripe | ||
72 | */ | ||
73 | u64 *raid_map; | ||
74 | |||
75 | /* while we're doing rmw on a stripe | ||
76 | * we put it into a hash table so we can | ||
77 | * lock the stripe and merge more rbios | ||
78 | * into it. | ||
79 | */ | ||
80 | struct list_head hash_list; | ||
81 | |||
82 | /* | ||
83 | * LRU list for the stripe cache | ||
84 | */ | ||
85 | struct list_head stripe_cache; | ||
86 | |||
87 | /* | ||
88 | * for scheduling work in the helper threads | ||
89 | */ | ||
90 | struct btrfs_work work; | ||
91 | |||
92 | /* | ||
93 | * bio list and bio_list_lock are used | ||
94 | * to add more bios into the stripe | ||
95 | * in hopes of avoiding the full rmw | ||
96 | */ | ||
97 | struct bio_list bio_list; | ||
98 | spinlock_t bio_list_lock; | ||
99 | |||
100 | /* also protected by the bio_list_lock, the | ||
101 | * plug list is used by the plugging code | ||
102 | * to collect partial bios while plugged. The | ||
103 | * stripe locking code also uses it to hand off | ||
104 | * the stripe lock to the next pending IO | ||
105 | */ | ||
106 | struct list_head plug_list; | ||
107 | |||
108 | /* | ||
109 | * flags that tell us if it is safe to | ||
110 | * merge with this bio | ||
111 | */ | ||
112 | unsigned long flags; | ||
113 | |||
114 | /* size of each individual stripe on disk */ | ||
115 | int stripe_len; | ||
116 | |||
117 | /* number of data stripes (no p/q) */ | ||
118 | int nr_data; | ||
119 | |||
120 | /* | ||
121 | * set if we're doing a parity rebuild | ||
122 | * for a read from higher up, which is handled | ||
123 | * differently from a parity rebuild as part of | ||
124 | * rmw | ||
125 | */ | ||
126 | int read_rebuild; | ||
127 | |||
128 | /* first bad stripe */ | ||
129 | int faila; | ||
130 | |||
131 | /* second bad stripe (for raid6 use) */ | ||
132 | int failb; | ||
133 | |||
134 | /* | ||
135 | * number of pages needed to represent the full | ||
136 | * stripe | ||
137 | */ | ||
138 | int nr_pages; | ||
139 | |||
140 | /* | ||
141 | * size of all the bios in the bio_list. This | ||
142 | * helps us decide if the rbio maps to a full | ||
143 | * stripe or not | ||
144 | */ | ||
145 | int bio_list_bytes; | ||
146 | |||
147 | atomic_t refs; | ||
148 | |||
149 | /* | ||
150 | * these are two arrays of pointers. We allocate the | ||
151 | * rbio big enough to hold them both and setup their | ||
152 | * locations when the rbio is allocated | ||
153 | */ | ||
154 | |||
155 | /* pointers to pages that we allocated for | ||
156 | * reading/writing stripes directly from the disk (including P/Q) | ||
157 | */ | ||
158 | struct page **stripe_pages; | ||
159 | |||
160 | /* | ||
161 | * pointers to the pages in the bio_list. Stored | ||
162 | * here for faster lookup | ||
163 | */ | ||
164 | struct page **bio_pages; | ||
165 | }; | ||
166 | |||
167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | ||
168 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio); | ||
169 | static void rmw_work(struct btrfs_work *work); | ||
170 | static void read_rebuild_work(struct btrfs_work *work); | ||
171 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio); | ||
172 | static void async_read_rebuild(struct btrfs_raid_bio *rbio); | ||
173 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); | ||
174 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); | ||
175 | static void __free_raid_bio(struct btrfs_raid_bio *rbio); | ||
176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | ||
177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | ||
178 | |||
179 | /* | ||
180 | * the stripe hash table is used for locking, and to collect | ||
181 | * bios in hopes of making a full stripe | ||
182 | */ | ||
183 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) | ||
184 | { | ||
185 | struct btrfs_stripe_hash_table *table; | ||
186 | struct btrfs_stripe_hash_table *x; | ||
187 | struct btrfs_stripe_hash *cur; | ||
188 | struct btrfs_stripe_hash *h; | ||
189 | int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; | ||
190 | int i; | ||
191 | |||
192 | if (info->stripe_hash_table) | ||
193 | return 0; | ||
194 | |||
195 | table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS); | ||
196 | if (!table) | ||
197 | return -ENOMEM; | ||
198 | |||
199 | spin_lock_init(&table->cache_lock); | ||
200 | INIT_LIST_HEAD(&table->stripe_cache); | ||
201 | |||
202 | h = table->table; | ||
203 | |||
204 | for (i = 0; i < num_entries; i++) { | ||
205 | cur = h + i; | ||
206 | INIT_LIST_HEAD(&cur->hash_list); | ||
207 | spin_lock_init(&cur->lock); | ||
208 | init_waitqueue_head(&cur->wait); | ||
209 | } | ||
210 | |||
211 | x = cmpxchg(&info->stripe_hash_table, NULL, table); | ||
212 | if (x) | ||
213 | kfree(x); | ||
214 | return 0; | ||
215 | } | ||
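Note: the cmpxchg() at the end is a lock-free publish: racing callers may each build a table, but only the first one is installed. A userspace sketch of the same allocate-then-publish idiom (assumption: GCC's __sync builtins; here the losing thread releases the copy it built, all names illustrative):

#include <stdlib.h>

struct table { int dummy; };
static struct table *global_table;

static int install_table(void)
{
	struct table *t, *old;

	if (global_table)
		return 0;
	t = calloc(1, sizeof(*t));	/* stands in for kzalloc() */
	if (!t)
		return -1;
	old = __sync_val_compare_and_swap(&global_table, NULL, t);
	if (old)	/* lost the race: keep theirs, drop ours */
		free(t);
	return 0;
}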
216 | |||
217 | /* | ||
218 | * caching an rbio means to copy anything from the | ||
219 | * bio_pages array into the stripe_pages array. We | ||
220 | * use the page uptodate bit in the stripe cache array | ||
221 | * to indicate if it has valid data | ||
222 | * | ||
223 | * once the caching is done, we set the cache ready | ||
224 | * bit. | ||
225 | */ | ||
226 | static void cache_rbio_pages(struct btrfs_raid_bio *rbio) | ||
227 | { | ||
228 | int i; | ||
229 | char *s; | ||
230 | char *d; | ||
231 | int ret; | ||
232 | |||
233 | ret = alloc_rbio_pages(rbio); | ||
234 | if (ret) | ||
235 | return; | ||
236 | |||
237 | for (i = 0; i < rbio->nr_pages; i++) { | ||
238 | if (!rbio->bio_pages[i]) | ||
239 | continue; | ||
240 | |||
241 | s = kmap(rbio->bio_pages[i]); | ||
242 | d = kmap(rbio->stripe_pages[i]); | ||
243 | |||
244 | memcpy(d, s, PAGE_CACHE_SIZE); | ||
245 | |||
246 | kunmap(rbio->bio_pages[i]); | ||
247 | kunmap(rbio->stripe_pages[i]); | ||
248 | SetPageUptodate(rbio->stripe_pages[i]); | ||
249 | } | ||
250 | set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
251 | } | ||
252 | |||
253 | /* | ||
254 | * we hash on the first logical address of the stripe | ||
255 | */ | ||
256 | static int rbio_bucket(struct btrfs_raid_bio *rbio) | ||
257 | { | ||
258 | u64 num = rbio->raid_map[0]; | ||
259 | |||
260 | /* | ||
261 | * we shift down quite a bit. We're using byte | ||
262 | * addressing, and most of the lower bits are zeros. | ||
263 | * This tends to upset hash_64, and it consistently | ||
264 | * returns just one or two different values. | ||
265 | * | ||
266 | * shifting off the lower bits fixes things. | ||
267 | */ | ||
268 | return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); | ||
269 | } | ||
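To see why the shift matters: full stripe starts are byte addresses aligned to the stripe size, so the low bits are always zero and feed a multiplicative hash nothing useful. A standalone sketch of the bucket math (assumptions: an 11-bit table, i.e. BTRFS_STRIPE_HASH_TABLE_BITS == 11, and a Fibonacci-style multiplier in the spirit of hash_64(); the kernel's exact constant differs by version):

#include <stdio.h>
#include <stdint.h>

#define TABLE_BITS 11	/* assumed BTRFS_STRIPE_HASH_TABLE_BITS */

static unsigned int bucket(uint64_t logical)
{
	uint64_t h = (logical >> 16) * 0x9e3779b97f4a7c15ULL;

	return (unsigned int)(h >> (64 - TABLE_BITS));
}

int main(void)
{
	/* two adjacent full stripes land in different buckets */
	printf("%u %u\n", bucket(1ULL << 30),
	       bucket((1ULL << 30) + (192 << 10)));
	return 0;
}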
270 | |||
271 | /* | ||
272 | * stealing an rbio means taking all the uptodate pages from the stripe | ||
273 | * array in the source rbio and putting them into the destination rbio | ||
274 | */ | ||
275 | static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) | ||
276 | { | ||
277 | int i; | ||
278 | struct page *s; | ||
279 | struct page *d; | ||
280 | |||
281 | if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) | ||
282 | return; | ||
283 | |||
284 | for (i = 0; i < dest->nr_pages; i++) { | ||
285 | s = src->stripe_pages[i]; | ||
286 | if (!s || !PageUptodate(s)) { | ||
287 | continue; | ||
288 | } | ||
289 | |||
290 | d = dest->stripe_pages[i]; | ||
291 | if (d) | ||
292 | __free_page(d); | ||
293 | |||
294 | dest->stripe_pages[i] = s; | ||
295 | src->stripe_pages[i] = NULL; | ||
296 | } | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * merging means we take the bio_list from the victim and | ||
301 | * splice it into the destination. The victim should | ||
302 | * be discarded afterwards. | ||
303 | * | ||
304 | * must be called with dest->bio_list_lock held | ||
305 | */ | ||
306 | static void merge_rbio(struct btrfs_raid_bio *dest, | ||
307 | struct btrfs_raid_bio *victim) | ||
308 | { | ||
309 | bio_list_merge(&dest->bio_list, &victim->bio_list); | ||
310 | dest->bio_list_bytes += victim->bio_list_bytes; | ||
311 | bio_list_init(&victim->bio_list); | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * used to prune items that are in the cache. The caller | ||
316 | * must hold the hash table lock. | ||
317 | */ | ||
318 | static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
319 | { | ||
320 | int bucket = rbio_bucket(rbio); | ||
321 | struct btrfs_stripe_hash_table *table; | ||
322 | struct btrfs_stripe_hash *h; | ||
323 | int freeit = 0; | ||
324 | |||
325 | /* | ||
326 | * check the bit again under the hash table lock. | ||
327 | */ | ||
328 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
329 | return; | ||
330 | |||
331 | table = rbio->fs_info->stripe_hash_table; | ||
332 | h = table->table + bucket; | ||
333 | |||
334 | /* hold the lock for the bucket because we may be | ||
335 | * removing it from the hash table | ||
336 | */ | ||
337 | spin_lock(&h->lock); | ||
338 | |||
339 | /* | ||
340 | * hold the lock for the bio list because we need | ||
341 | * to make sure the bio list is empty | ||
342 | */ | ||
343 | spin_lock(&rbio->bio_list_lock); | ||
344 | |||
345 | if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
346 | list_del_init(&rbio->stripe_cache); | ||
347 | table->cache_size -= 1; | ||
348 | freeit = 1; | ||
349 | |||
350 | /* if the bio list isn't empty, this rbio is | ||
351 | * still involved in an IO. We take it out | ||
352 | * of the cache list, and drop the ref that | ||
353 | * was held for the list. | ||
354 | * | ||
355 | * If the bio_list was empty, we also remove | ||
356 | * the rbio from the hash_table, and drop | ||
357 | * the corresponding ref | ||
358 | */ | ||
359 | if (bio_list_empty(&rbio->bio_list)) { | ||
360 | if (!list_empty(&rbio->hash_list)) { | ||
361 | list_del_init(&rbio->hash_list); | ||
362 | atomic_dec(&rbio->refs); | ||
363 | BUG_ON(!list_empty(&rbio->plug_list)); | ||
364 | } | ||
365 | } | ||
366 | } | ||
367 | |||
368 | spin_unlock(&rbio->bio_list_lock); | ||
369 | spin_unlock(&h->lock); | ||
370 | |||
371 | if (freeit) | ||
372 | __free_raid_bio(rbio); | ||
373 | } | ||
374 | |||
375 | /* | ||
376 | * prune a given rbio from the cache | ||
377 | */ | ||
378 | static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
379 | { | ||
380 | struct btrfs_stripe_hash_table *table; | ||
381 | unsigned long flags; | ||
382 | |||
383 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
384 | return; | ||
385 | |||
386 | table = rbio->fs_info->stripe_hash_table; | ||
387 | |||
388 | spin_lock_irqsave(&table->cache_lock, flags); | ||
389 | __remove_rbio_from_cache(rbio); | ||
390 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
391 | } | ||
392 | |||
393 | /* | ||
394 | * remove everything in the cache | ||
395 | */ | ||
396 | void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) | ||
397 | { | ||
398 | struct btrfs_stripe_hash_table *table; | ||
399 | unsigned long flags; | ||
400 | struct btrfs_raid_bio *rbio; | ||
401 | |||
402 | table = info->stripe_hash_table; | ||
403 | |||
404 | spin_lock_irqsave(&table->cache_lock, flags); | ||
405 | while (!list_empty(&table->stripe_cache)) { | ||
406 | rbio = list_entry(table->stripe_cache.next, | ||
407 | struct btrfs_raid_bio, | ||
408 | stripe_cache); | ||
409 | __remove_rbio_from_cache(rbio); | ||
410 | } | ||
411 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
412 | } | ||
413 | |||
414 | /* | ||
415 | * remove all cached entries and free the hash table | ||
416 | * used by unmount | ||
417 | */ | ||
418 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) | ||
419 | { | ||
420 | if (!info->stripe_hash_table) | ||
421 | return; | ||
422 | btrfs_clear_rbio_cache(info); | ||
423 | kfree(info->stripe_hash_table); | ||
424 | info->stripe_hash_table = NULL; | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * insert an rbio into the stripe cache. It | ||
429 | * must have already been prepared by calling | ||
430 | * cache_rbio_pages | ||
431 | * | ||
432 | * If this rbio was already cached, it gets | ||
433 | * moved to the front of the lru. | ||
434 | * | ||
435 | * If the size of the rbio cache is too big, we | ||
436 | * prune an item. | ||
437 | */ | ||
438 | static void cache_rbio(struct btrfs_raid_bio *rbio) | ||
439 | { | ||
440 | struct btrfs_stripe_hash_table *table; | ||
441 | unsigned long flags; | ||
442 | |||
443 | if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) | ||
444 | return; | ||
445 | |||
446 | table = rbio->fs_info->stripe_hash_table; | ||
447 | |||
448 | spin_lock_irqsave(&table->cache_lock, flags); | ||
449 | spin_lock(&rbio->bio_list_lock); | ||
450 | |||
451 | /* bump our ref if we were not in the list before */ | ||
452 | if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
453 | atomic_inc(&rbio->refs); | ||
454 | |||
455 | if (!list_empty(&rbio->stripe_cache)) { | ||
456 | list_move(&rbio->stripe_cache, &table->stripe_cache); | ||
457 | } else { | ||
458 | list_add(&rbio->stripe_cache, &table->stripe_cache); | ||
459 | table->cache_size += 1; | ||
460 | } | ||
461 | |||
462 | spin_unlock(&rbio->bio_list_lock); | ||
463 | |||
464 | if (table->cache_size > RBIO_CACHE_SIZE) { | ||
465 | struct btrfs_raid_bio *found; | ||
466 | |||
467 | found = list_entry(table->stripe_cache.prev, | ||
468 | struct btrfs_raid_bio, | ||
469 | stripe_cache); | ||
470 | |||
471 | if (found != rbio) | ||
472 | __remove_rbio_from_cache(found); | ||
473 | } | ||
474 | |||
475 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
476 | return; | ||
477 | } | ||
478 | |||
479 | /* | ||
480 | * helper function to run the xor_blocks api. It is only | ||
481 | * able to do MAX_XOR_BLOCKS at a time, so we need to | ||
482 | * loop through. | ||
483 | */ | ||
484 | static void run_xor(void **pages, int src_cnt, ssize_t len) | ||
485 | { | ||
486 | int src_off = 0; | ||
487 | int xor_src_cnt = 0; | ||
488 | void *dest = pages[src_cnt]; | ||
489 | |||
490 | while (src_cnt > 0) { | ||
491 | xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); | ||
492 | xor_blocks(xor_src_cnt, len, dest, pages + src_off); | ||
493 | |||
494 | src_cnt -= xor_src_cnt; | ||
495 | src_off += xor_src_cnt; | ||
496 | } | ||
497 | } | ||
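A userspace rendering of the same loop, with a trivial stand-in for xor_blocks(), shows how the sources are consumed MAX_XOR_BLOCKS at a time and folded into the destination stored one slot past the sources (sizes and names here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define MAX_XOR_BLOCKS 4
#define LEN 8

static void xor_blocks(int src_cnt, size_t len, void *dest, void **srcs)
{
	unsigned char *d = dest;

	for (int i = 0; i < src_cnt; i++) {
		unsigned char *s = srcs[i];
		for (size_t j = 0; j < len; j++)
			d[j] ^= s[j];	/* dest ^= each source */
	}
}

static void run_xor(void **pages, int src_cnt, size_t len)
{
	int src_off = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		int n = src_cnt < MAX_XOR_BLOCKS ? src_cnt : MAX_XOR_BLOCKS;

		xor_blocks(n, len, dest, pages + src_off);
		src_cnt -= n;
		src_off += n;
	}
}

int main(void)
{
	unsigned char a[LEN] = "AAAAAAA", b[LEN] = "BBBBBBB", p[LEN] = { 0 };
	void *ptrs[] = { a, b, p };	/* last slot is the destination */

	run_xor(ptrs, 2, LEN);
	printf("parity[0] = 0x%02x\n", p[0]);	/* 'A' ^ 'B' = 0x03 */
	return 0;
}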
498 | |||
499 | /* | ||
500 | * returns true if the bio list inside this rbio | ||
501 | * covers an entire stripe (no rmw required). | ||
502 | * Must be called with the bio list lock held, or | ||
503 | * at a time when you know it is impossible to add | ||
504 | * new bios into the list | ||
505 | */ | ||
506 | static int __rbio_is_full(struct btrfs_raid_bio *rbio) | ||
507 | { | ||
508 | unsigned long size = rbio->bio_list_bytes; | ||
509 | int ret = 1; | ||
510 | |||
511 | if (size != rbio->nr_data * rbio->stripe_len) | ||
512 | ret = 0; | ||
513 | |||
514 | BUG_ON(size > rbio->nr_data * rbio->stripe_len); | ||
515 | return ret; | ||
516 | } | ||
517 | |||
518 | static int rbio_is_full(struct btrfs_raid_bio *rbio) | ||
519 | { | ||
520 | unsigned long flags; | ||
521 | int ret; | ||
522 | |||
523 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
524 | ret = __rbio_is_full(rbio); | ||
525 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
526 | return ret; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * returns 1 if it is safe to merge two rbios together. | ||
531 | * The merging is safe if the two rbios correspond to | ||
532 | * the same stripe and if they are both going in the same | ||
533 | * direction (read vs write), and if neither one is | ||
534 | * locked for final IO | ||
535 | * | ||
536 | * The caller is responsible for locking such that | ||
537 | * rmw_locked is safe to test | ||
538 | */ | ||
539 | static int rbio_can_merge(struct btrfs_raid_bio *last, | ||
540 | struct btrfs_raid_bio *cur) | ||
541 | { | ||
542 | if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || | ||
543 | test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) | ||
544 | return 0; | ||
545 | |||
546 | /* | ||
547 | * we can't merge with cached rbios, since the | ||
548 | * idea is that when we merge the destination | ||
549 | * rbio is going to run our IO for us. We can | ||
550 | * steal from cached rbios though; other functions | ||
551 | * handle that. | ||
552 | */ | ||
553 | if (test_bit(RBIO_CACHE_BIT, &last->flags) || | ||
554 | test_bit(RBIO_CACHE_BIT, &cur->flags)) | ||
555 | return 0; | ||
556 | |||
557 | if (last->raid_map[0] != | ||
558 | cur->raid_map[0]) | ||
559 | return 0; | ||
560 | |||
561 | /* reads can't merge with writes */ | ||
562 | if (last->read_rebuild != | ||
563 | cur->read_rebuild) { | ||
564 | return 0; | ||
565 | } | ||
566 | |||
567 | return 1; | ||
568 | } | ||
569 | |||
570 | /* | ||
571 | * helper to index into the pstripe | ||
572 | */ | ||
573 | static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
574 | { | ||
575 | index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
576 | return rbio->stripe_pages[index]; | ||
577 | } | ||
578 | |||
579 | /* | ||
580 | * helper to index into the qstripe, returns null | ||
581 | * if there is no qstripe | ||
582 | */ | ||
583 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
584 | { | ||
585 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | ||
586 | return NULL; | ||
587 | |||
588 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | ||
589 | PAGE_CACHE_SHIFT; | ||
590 | return rbio->stripe_pages[index]; | ||
591 | } | ||
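Concretely, with an assumed geometry of 64KiB stripe_len, 4KiB pages, and three data stripes plus P and Q, stripe_pages[] holds 16 pages per stripe laid end to end; the P pages occupy indexes 48-63 and the Q pages 64-79:

#include <stdio.h>

int main(void)
{
	int stripe_len = 64 * 1024, page_size = 4096;	/* assumed geometry */
	int nr_data = 3;				/* RAID6: + P + Q */
	int per_stripe = stripe_len / page_size;	/* 16 */

	/* same index math as rbio_pstripe_page()/rbio_qstripe_page() */
	printf("P pages start at %d, Q pages at %d\n",
	       nr_data * per_stripe, (nr_data + 1) * per_stripe);
	return 0;
}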
592 | |||
593 | /* | ||
594 | * The first stripe in the table for a logical address | ||
595 | * has the lock. rbios are added in one of three ways: | ||
596 | * | ||
597 | * 1) Nobody has the stripe locked yet. The rbio is given | ||
598 | * the lock and 0 is returned. The caller must start the IO | ||
599 | * themselves. | ||
600 | * | ||
601 | * 2) Someone has the stripe locked, but we're able to merge | ||
602 | * with the lock owner. The rbio is freed and the IO will | ||
603 | * start automatically along with the existing rbio. 1 is returned. | ||
604 | * | ||
605 | * 3) Someone has the stripe locked, but we're not able to merge. | ||
606 | * The rbio is added to the lock owner's plug list, or merged into | ||
607 | * an rbio already on the plug list. When the lock owner unlocks, | ||
608 | * the next rbio on the list is run and the IO is started automatically. | ||
609 | * 1 is returned | ||
610 | * | ||
611 | * If we return 0, the caller still owns the rbio and must continue with | ||
612 | * IO submission. If we return 1, the caller must assume the rbio has | ||
613 | * already been freed. | ||
614 | */ | ||
615 | static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) | ||
616 | { | ||
617 | int bucket = rbio_bucket(rbio); | ||
618 | struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
619 | struct btrfs_raid_bio *cur; | ||
620 | struct btrfs_raid_bio *pending; | ||
621 | unsigned long flags; | ||
622 | DEFINE_WAIT(wait); | ||
623 | struct btrfs_raid_bio *freeit = NULL; | ||
624 | struct btrfs_raid_bio *cache_drop = NULL; | ||
625 | int ret = 0; | ||
626 | int walk = 0; | ||
627 | |||
628 | spin_lock_irqsave(&h->lock, flags); | ||
629 | list_for_each_entry(cur, &h->hash_list, hash_list) { | ||
630 | walk++; | ||
631 | if (cur->raid_map[0] == rbio->raid_map[0]) { | ||
632 | spin_lock(&cur->bio_list_lock); | ||
633 | |||
634 | /* can we steal this cached rbio's pages? */ | ||
635 | if (bio_list_empty(&cur->bio_list) && | ||
636 | list_empty(&cur->plug_list) && | ||
637 | test_bit(RBIO_CACHE_BIT, &cur->flags) && | ||
638 | !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { | ||
639 | list_del_init(&cur->hash_list); | ||
640 | atomic_dec(&cur->refs); | ||
641 | |||
642 | steal_rbio(cur, rbio); | ||
643 | cache_drop = cur; | ||
644 | spin_unlock(&cur->bio_list_lock); | ||
645 | |||
646 | goto lockit; | ||
647 | } | ||
648 | |||
649 | /* can we merge into the lock owner? */ | ||
650 | if (rbio_can_merge(cur, rbio)) { | ||
651 | merge_rbio(cur, rbio); | ||
652 | spin_unlock(&cur->bio_list_lock); | ||
653 | freeit = rbio; | ||
654 | ret = 1; | ||
655 | goto out; | ||
656 | } | ||
657 | |||
658 | |||
659 | /* | ||
660 | * we couldn't merge with the running | ||
661 | * rbio, see if we can merge with the | ||
662 | * pending ones. We don't have to | ||
663 | * check for rmw_locked because there | ||
664 | * is no way they are inside finish_rmw | ||
665 | * right now | ||
666 | */ | ||
667 | list_for_each_entry(pending, &cur->plug_list, | ||
668 | plug_list) { | ||
669 | if (rbio_can_merge(pending, rbio)) { | ||
670 | merge_rbio(pending, rbio); | ||
671 | spin_unlock(&cur->bio_list_lock); | ||
672 | freeit = rbio; | ||
673 | ret = 1; | ||
674 | goto out; | ||
675 | } | ||
676 | } | ||
677 | |||
678 | /* no merging, put us on the tail of the plug list, | ||
679 | * our rbio will be started when the currently | ||
680 | * running rbio unlocks | ||
681 | */ | ||
682 | list_add_tail(&rbio->plug_list, &cur->plug_list); | ||
683 | spin_unlock(&cur->bio_list_lock); | ||
684 | ret = 1; | ||
685 | goto out; | ||
686 | } | ||
687 | } | ||
688 | lockit: | ||
689 | atomic_inc(&rbio->refs); | ||
690 | list_add(&rbio->hash_list, &h->hash_list); | ||
691 | out: | ||
692 | spin_unlock_irqrestore(&h->lock, flags); | ||
693 | if (cache_drop) | ||
694 | remove_rbio_from_cache(cache_drop); | ||
695 | if (freeit) | ||
696 | __free_raid_bio(freeit); | ||
697 | return ret; | ||
698 | } | ||
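The contract in practice, as a sketch (hypothetical caller; the real ones are full_stripe_write() and partial_stripe_write() further down): 0 means this rbio now owns the stripe lock and must drive the IO itself, 1 means it was merged or queued and must not be touched again:

static int submit_rbio_sketch(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio) == 0)
		finish_rmw(rbio);	/* we hold the lock: start the IO */
	/* else: merged or plugged; the lock owner runs it for us */
	return 0;
}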
699 | |||
700 | /* | ||
701 | * called as rmw or parity rebuild is completed. If the plug list has more | ||
702 | * rbios waiting for this stripe, the next one on the list will be started | ||
703 | */ | ||
704 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | ||
705 | { | ||
706 | int bucket; | ||
707 | struct btrfs_stripe_hash *h; | ||
708 | unsigned long flags; | ||
709 | int keep_cache = 0; | ||
710 | |||
711 | bucket = rbio_bucket(rbio); | ||
712 | h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
713 | |||
714 | if (list_empty(&rbio->plug_list)) | ||
715 | cache_rbio(rbio); | ||
716 | |||
717 | spin_lock_irqsave(&h->lock, flags); | ||
718 | spin_lock(&rbio->bio_list_lock); | ||
719 | |||
720 | if (!list_empty(&rbio->hash_list)) { | ||
721 | /* | ||
722 | * if we're still cached and there is no other IO | ||
723 | * to perform, just leave this rbio here for others | ||
724 | * to steal from later | ||
725 | */ | ||
726 | if (list_empty(&rbio->plug_list) && | ||
727 | test_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
728 | keep_cache = 1; | ||
729 | clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
730 | BUG_ON(!bio_list_empty(&rbio->bio_list)); | ||
731 | goto done; | ||
732 | } | ||
733 | |||
734 | list_del_init(&rbio->hash_list); | ||
735 | atomic_dec(&rbio->refs); | ||
736 | |||
737 | /* | ||
738 | * we use the plug list to hold all the rbios | ||
739 | * waiting for the chance to lock this stripe. | ||
740 | * hand the lock over to one of them. | ||
741 | */ | ||
742 | if (!list_empty(&rbio->plug_list)) { | ||
743 | struct btrfs_raid_bio *next; | ||
744 | struct list_head *head = rbio->plug_list.next; | ||
745 | |||
746 | next = list_entry(head, struct btrfs_raid_bio, | ||
747 | plug_list); | ||
748 | |||
749 | list_del_init(&rbio->plug_list); | ||
750 | |||
751 | list_add(&next->hash_list, &h->hash_list); | ||
752 | atomic_inc(&next->refs); | ||
753 | spin_unlock(&rbio->bio_list_lock); | ||
754 | spin_unlock_irqrestore(&h->lock, flags); | ||
755 | |||
756 | if (next->read_rebuild) | ||
757 | async_read_rebuild(next); | ||
758 | else { | ||
759 | steal_rbio(rbio, next); | ||
760 | async_rmw_stripe(next); | ||
761 | } | ||
762 | |||
763 | goto done_nolock; | ||
764 | } else if (waitqueue_active(&h->wait)) { | ||
765 | spin_unlock(&rbio->bio_list_lock); | ||
766 | spin_unlock_irqrestore(&h->lock, flags); | ||
767 | wake_up(&h->wait); | ||
768 | goto done_nolock; | ||
769 | } | ||
770 | } | ||
771 | done: | ||
772 | spin_unlock(&rbio->bio_list_lock); | ||
773 | spin_unlock_irqrestore(&h->lock, flags); | ||
774 | |||
775 | done_nolock: | ||
776 | if (!keep_cache) | ||
777 | remove_rbio_from_cache(rbio); | ||
778 | } | ||
779 | |||
780 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | ||
781 | { | ||
782 | int i; | ||
783 | |||
784 | WARN_ON(atomic_read(&rbio->refs) < 0); | ||
785 | if (!atomic_dec_and_test(&rbio->refs)) | ||
786 | return; | ||
787 | |||
788 | WARN_ON(!list_empty(&rbio->stripe_cache)); | ||
789 | WARN_ON(!list_empty(&rbio->hash_list)); | ||
790 | WARN_ON(!bio_list_empty(&rbio->bio_list)); | ||
791 | |||
792 | for (i = 0; i < rbio->nr_pages; i++) { | ||
793 | if (rbio->stripe_pages[i]) { | ||
794 | __free_page(rbio->stripe_pages[i]); | ||
795 | rbio->stripe_pages[i] = NULL; | ||
796 | } | ||
797 | } | ||
798 | kfree(rbio->raid_map); | ||
799 | kfree(rbio->bbio); | ||
800 | kfree(rbio); | ||
801 | } | ||
802 | |||
803 | static void free_raid_bio(struct btrfs_raid_bio *rbio) | ||
804 | { | ||
805 | unlock_stripe(rbio); | ||
806 | __free_raid_bio(rbio); | ||
807 | } | ||
808 | |||
809 | /* | ||
810 | * this frees the rbio and runs through all the bios in the | ||
811 | * bio_list and calls end_io on them | ||
812 | */ | ||
813 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | ||
814 | { | ||
815 | struct bio *cur = bio_list_get(&rbio->bio_list); | ||
816 | struct bio *next; | ||
817 | free_raid_bio(rbio); | ||
818 | |||
819 | while (cur) { | ||
820 | next = cur->bi_next; | ||
821 | cur->bi_next = NULL; | ||
822 | if (uptodate) | ||
823 | set_bit(BIO_UPTODATE, &cur->bi_flags); | ||
824 | bio_endio(cur, err); | ||
825 | cur = next; | ||
826 | } | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * end io function used by finish_rmw. When we finally | ||
831 | * get here, we've written a full stripe | ||
832 | */ | ||
833 | static void raid_write_end_io(struct bio *bio, int err) | ||
834 | { | ||
835 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
836 | |||
837 | if (err) | ||
838 | fail_bio_stripe(rbio, bio); | ||
839 | |||
840 | bio_put(bio); | ||
841 | |||
842 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
843 | return; | ||
844 | |||
845 | err = 0; | ||
846 | |||
847 | /* OK, we have written all the stripes we need to. */ | ||
848 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
849 | err = -EIO; | ||
850 | |||
851 | rbio_orig_end_io(rbio, err, 0); | ||
852 | return; | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * the read/modify/write code wants to use the original bio for | ||
857 | * any pages it included, and then use the rbio for everything | ||
858 | * else. This function decides if a given index (stripe number) | ||
859 | * and page number in that stripe fall inside the original bio | ||
860 | * or the rbio. | ||
861 | * | ||
862 | * if you set bio_list_only, you'll get a NULL back for any ranges | ||
863 | * that are outside the bio_list | ||
864 | * | ||
865 | * This doesn't take any refs on anything, you get a bare page pointer | ||
866 | * and the caller must bump refs as required. | ||
867 | * | ||
868 | * You must call index_rbio_pages once before you can trust | ||
869 | * the answers from this function. | ||
870 | */ | ||
871 | static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, | ||
872 | int index, int pagenr, int bio_list_only) | ||
873 | { | ||
874 | int chunk_page; | ||
875 | struct page *p = NULL; | ||
876 | |||
877 | chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; | ||
878 | |||
879 | spin_lock_irq(&rbio->bio_list_lock); | ||
880 | p = rbio->bio_pages[chunk_page]; | ||
881 | spin_unlock_irq(&rbio->bio_list_lock); | ||
882 | |||
883 | if (p || bio_list_only) | ||
884 | return p; | ||
885 | |||
886 | return rbio->stripe_pages[chunk_page]; | ||
887 | } | ||
888 | |||
889 | /* | ||
890 | * number of pages we need for the entire stripe across all the | ||
891 | * drives | ||
892 | */ | ||
893 | static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) | ||
894 | { | ||
895 | unsigned long nr = stripe_len * nr_stripes; | ||
896 | return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
897 | } | ||
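In plain numbers (assuming 4KiB PAGE_CACHE_SIZE): a 64KiB stripe across five drives (three data plus P and Q) needs (5 * 65536 + 4095) / 4096 = 80 pages:

#include <stdio.h>

int main(void)
{
	unsigned long stripe_len = 64 * 1024, page_size = 4096;
	int nr_stripes = 5;
	unsigned long nr = stripe_len * nr_stripes;

	printf("%lu pages\n", (nr + page_size - 1) / page_size);
	return 0;
}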
898 | |||
899 | /* | ||
900 | * allocation and initial setup for the btrfs_raid_bio. Note | ||
901 | * that this does not allocate any pages for rbio->stripe_pages. | ||
902 | */ | ||
903 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | ||
904 | struct btrfs_bio *bbio, u64 *raid_map, | ||
905 | u64 stripe_len) | ||
906 | { | ||
907 | struct btrfs_raid_bio *rbio; | ||
908 | int nr_data = 0; | ||
909 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | ||
910 | void *p; | ||
911 | |||
912 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | ||
913 | GFP_NOFS); | ||
914 | if (!rbio) { | ||
915 | kfree(raid_map); | ||
916 | kfree(bbio); | ||
917 | return ERR_PTR(-ENOMEM); | ||
918 | } | ||
919 | |||
920 | bio_list_init(&rbio->bio_list); | ||
921 | INIT_LIST_HEAD(&rbio->plug_list); | ||
922 | spin_lock_init(&rbio->bio_list_lock); | ||
923 | INIT_LIST_HEAD(&rbio->stripe_cache); | ||
924 | INIT_LIST_HEAD(&rbio->hash_list); | ||
925 | rbio->bbio = bbio; | ||
926 | rbio->raid_map = raid_map; | ||
927 | rbio->fs_info = root->fs_info; | ||
928 | rbio->stripe_len = stripe_len; | ||
929 | rbio->nr_pages = num_pages; | ||
930 | rbio->faila = -1; | ||
931 | rbio->failb = -1; | ||
932 | atomic_set(&rbio->refs, 1); | ||
933 | |||
934 | /* | ||
935 | * the stripe_pages and bio_pages array point to the extra | ||
936 | * memory we allocated past the end of the rbio | ||
937 | */ | ||
938 | p = rbio + 1; | ||
939 | rbio->stripe_pages = p; | ||
940 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | ||
941 | |||
942 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
943 | nr_data = bbio->num_stripes - 2; | ||
944 | else | ||
945 | nr_data = bbio->num_stripes - 1; | ||
946 | |||
947 | rbio->nr_data = nr_data; | ||
948 | return rbio; | ||
949 | } | ||
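The single kzalloc() covers the struct plus both page-pointer arrays, which are then carved out of the memory just past the struct. The same trick in a freestanding userspace sketch (names illustrative):

#include <stdlib.h>

struct rbio_sketch {
	int nr_pages;
	void **stripe_pages;
	void **bio_pages;
};

static struct rbio_sketch *alloc_sketch(int num_pages)
{
	struct rbio_sketch *r;
	char *p;

	r = calloc(1, sizeof(*r) + num_pages * sizeof(void *) * 2);
	if (!r)
		return NULL;
	p = (char *)(r + 1);		/* first byte past the struct */
	r->stripe_pages = (void **)p;
	r->bio_pages = (void **)(p + num_pages * sizeof(void *));
	r->nr_pages = num_pages;
	return r;
}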
950 | |||
951 | /* allocate pages for all the stripes in the bio, including parity */ | ||
952 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) | ||
953 | { | ||
954 | int i; | ||
955 | struct page *page; | ||
956 | |||
957 | for (i = 0; i < rbio->nr_pages; i++) { | ||
958 | if (rbio->stripe_pages[i]) | ||
959 | continue; | ||
960 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
961 | if (!page) | ||
962 | return -ENOMEM; | ||
963 | rbio->stripe_pages[i] = page; | ||
964 | ClearPageUptodate(page); | ||
965 | } | ||
966 | return 0; | ||
967 | } | ||
968 | |||
969 | /* allocate pages for just the p/q stripes */ | ||
970 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | ||
971 | { | ||
972 | int i; | ||
973 | struct page *page; | ||
974 | |||
975 | i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
976 | |||
977 | for (; i < rbio->nr_pages; i++) { | ||
978 | if (rbio->stripe_pages[i]) | ||
979 | continue; | ||
980 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
981 | if (!page) | ||
982 | return -ENOMEM; | ||
983 | rbio->stripe_pages[i] = page; | ||
984 | } | ||
985 | return 0; | ||
986 | } | ||
987 | |||
988 | /* | ||
989 | * add a single page from a specific stripe into our list of bios for IO. | ||
990 | * This will try to merge into existing bios if possible, and returns | ||
991 | * zero if all went well. | ||
992 | */ | ||
993 | static int rbio_add_io_page(struct btrfs_raid_bio *rbio, | ||
994 | struct bio_list *bio_list, | ||
995 | struct page *page, | ||
996 | int stripe_nr, | ||
997 | unsigned long page_index, | ||
998 | unsigned long bio_max_len) | ||
999 | { | ||
1000 | struct bio *last = bio_list->tail; | ||
1001 | u64 last_end = 0; | ||
1002 | int ret; | ||
1003 | struct bio *bio; | ||
1004 | struct btrfs_bio_stripe *stripe; | ||
1005 | u64 disk_start; | ||
1006 | |||
1007 | stripe = &rbio->bbio->stripes[stripe_nr]; | ||
1008 | disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); | ||
1009 | |||
1010 | /* if the device is missing, just fail this stripe */ | ||
1011 | if (!stripe->dev->bdev) | ||
1012 | return fail_rbio_index(rbio, stripe_nr); | ||
1013 | |||
1014 | /* see if we can add this page onto our existing bio */ | ||
1015 | if (last) { | ||
1016 | last_end = (u64)last->bi_sector << 9; | ||
1017 | last_end += last->bi_size; | ||
1018 | |||
1019 | /* | ||
1020 | * we can't merge these if they are from different | ||
1021 | * devices or if they are not contiguous | ||
1022 | */ | ||
1023 | if (last_end == disk_start && stripe->dev->bdev && | ||
1024 | test_bit(BIO_UPTODATE, &last->bi_flags) && | ||
1025 | last->bi_bdev == stripe->dev->bdev) { | ||
1026 | ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); | ||
1027 | if (ret == PAGE_CACHE_SIZE) | ||
1028 | return 0; | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | /* put a new bio on the list */ | ||
1033 | bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1); | ||
1034 | if (!bio) | ||
1035 | return -ENOMEM; | ||
1036 | |||
1037 | bio->bi_size = 0; | ||
1038 | bio->bi_bdev = stripe->dev->bdev; | ||
1039 | bio->bi_sector = disk_start >> 9; | ||
1040 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
1041 | |||
1042 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
1043 | bio_list_add(bio_list, bio); | ||
1044 | return 0; | ||
1045 | } | ||
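The merge test with concrete numbers: a bio starting at sector 2048 (byte 1048576) that already holds 8KiB ends at byte 1056768, so a page whose disk_start is exactly 1056768 on the same device is appended to it rather than getting a bio of its own:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t last_end = (2048ULL << 9) + 8192;	/* end of current bio */
	uint64_t disk_start = 1056768;			/* next page on disk */

	printf("%s\n", last_end == disk_start ? "merge" : "new bio");
	return 0;
}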
1046 | |||
1047 | /* | ||
1048 | * while we're doing the read/modify/write cycle, we could | ||
1049 | * have errors in reading pages off the disk. This checks | ||
1050 | * for errors and if we're not able to read the page it'll | ||
1051 | * trigger parity reconstruction. The rmw will be finished | ||
1052 | * after we've reconstructed the failed stripes | ||
1053 | */ | ||
1054 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | ||
1055 | { | ||
1056 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
1057 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | ||
1058 | __raid56_parity_recover(rbio); | ||
1059 | } else { | ||
1060 | finish_rmw(rbio); | ||
1061 | } | ||
1062 | } | ||
1063 | |||
1064 | /* | ||
1065 | * these are just the pages from the rbio array, not from anything | ||
1066 | * the FS sent down to us | ||
1067 | */ | ||
1068 | static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) | ||
1069 | { | ||
1070 | int index; | ||
1071 | index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); | ||
1072 | index += page; | ||
1073 | return rbio->stripe_pages[index]; | ||
1074 | } | ||
1075 | |||
1076 | /* | ||
1077 | * helper function to walk our bio list and populate the bio_pages array with | ||
1078 | * the result. This seems expensive, but it is faster than constantly | ||
1079 | * searching through the bio list as we set up the IO in finish_rmw or stripe | ||
1080 | * reconstruction. | ||
1081 | * | ||
1082 | * This must be called before you trust the answers from page_in_rbio | ||
1083 | */ | ||
1084 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) | ||
1085 | { | ||
1086 | struct bio *bio; | ||
1087 | u64 start; | ||
1088 | unsigned long stripe_offset; | ||
1089 | unsigned long page_index; | ||
1090 | struct page *p; | ||
1091 | int i; | ||
1092 | |||
1093 | spin_lock_irq(&rbio->bio_list_lock); | ||
1094 | bio_list_for_each(bio, &rbio->bio_list) { | ||
1095 | start = (u64)bio->bi_sector << 9; | ||
1096 | stripe_offset = start - rbio->raid_map[0]; | ||
1097 | page_index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
1098 | |||
1099 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
1100 | p = bio->bi_io_vec[i].bv_page; | ||
1101 | rbio->bio_pages[page_index + i] = p; | ||
1102 | } | ||
1103 | } | ||
1104 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1105 | } | ||
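The index arithmetic with assumed numbers (4KiB pages, full stripe starting at logical 1GiB): a bio whose first sector maps to logical 1GiB + 24KiB fills bio_pages[] starting at slot 6, one slot per bvec:

#include <stdio.h>

int main(void)
{
	unsigned long long raid_map0 = 1ULL << 30;
	unsigned long long start = raid_map0 + 24 * 1024;

	printf("page_index = %llu\n", (start - raid_map0) >> 12);
	return 0;
}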
1106 | |||
1107 | /* | ||
1108 | * this is called in one of two situations. We either | ||
1109 | * have a full stripe from the higher layers, or we've read all | ||
1110 | * the missing bits off disk. | ||
1111 | * | ||
1112 | * This will calculate the parity and then send down any | ||
1113 | * changed blocks. | ||
1114 | */ | ||
1115 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | ||
1116 | { | ||
1117 | struct btrfs_bio *bbio = rbio->bbio; | ||
1118 | void *pointers[bbio->num_stripes]; | ||
1119 | int stripe_len = rbio->stripe_len; | ||
1120 | int nr_data = rbio->nr_data; | ||
1121 | int stripe; | ||
1122 | int pagenr; | ||
1123 | int p_stripe = -1; | ||
1124 | int q_stripe = -1; | ||
1125 | struct bio_list bio_list; | ||
1126 | struct bio *bio; | ||
1127 | int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; | ||
1128 | int ret; | ||
1129 | |||
1130 | bio_list_init(&bio_list); | ||
1131 | |||
1132 | if (bbio->num_stripes - rbio->nr_data == 1) { | ||
1133 | p_stripe = bbio->num_stripes - 1; | ||
1134 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | ||
1135 | p_stripe = bbio->num_stripes - 2; | ||
1136 | q_stripe = bbio->num_stripes - 1; | ||
1137 | } else { | ||
1138 | BUG(); | ||
1139 | } | ||
1140 | |||
1141 | /* at this point we either have a full stripe, | ||
1142 | * or we've read the full stripe from the drive. | ||
1143 | * recalculate the parity and write the new results. | ||
1144 | * | ||
1145 | * We're not allowed to add any new bios to the | ||
1146 | * bio list here, anyone else that wants to | ||
1147 | * change this stripe needs to do their own rmw. | ||
1148 | */ | ||
1149 | spin_lock_irq(&rbio->bio_list_lock); | ||
1150 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
1151 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1152 | |||
1153 | atomic_set(&rbio->bbio->error, 0); | ||
1154 | |||
1155 | /* | ||
1156 | * now that we've set rmw_locked, run through the | ||
1157 | * bio list one last time and map the page pointers | ||
1158 | * | ||
1159 | * We don't cache full rbios because we're assuming | ||
1160 | * the higher layers are unlikely to use this area of | ||
1161 | * the disk again soon. If they do use it again, | ||
1162 | * hopefully they will send another full bio. | ||
1163 | */ | ||
1164 | index_rbio_pages(rbio); | ||
1165 | if (!rbio_is_full(rbio)) | ||
1166 | cache_rbio_pages(rbio); | ||
1167 | else | ||
1168 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
1169 | |||
1170 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1171 | struct page *p; | ||
1172 | /* first collect one page from each data stripe */ | ||
1173 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
1174 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1175 | pointers[stripe] = kmap(p); | ||
1176 | } | ||
1177 | |||
1178 | /* then add the parity stripe */ | ||
1179 | p = rbio_pstripe_page(rbio, pagenr); | ||
1180 | SetPageUptodate(p); | ||
1181 | pointers[stripe++] = kmap(p); | ||
1182 | |||
1183 | if (q_stripe != -1) { | ||
1184 | |||
1185 | /* | ||
1186 | * raid6, add the qstripe and call the | ||
1187 | * library function to fill in our p/q | ||
1188 | */ | ||
1189 | p = rbio_qstripe_page(rbio, pagenr); | ||
1190 | SetPageUptodate(p); | ||
1191 | pointers[stripe++] = kmap(p); | ||
1192 | |||
1193 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | ||
1194 | pointers); | ||
1195 | } else { | ||
1196 | /* raid5 */ | ||
1197 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
1198 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
1199 | } | ||
1200 | |||
1201 | |||
1202 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | ||
1203 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
1204 | } | ||
1205 | |||
1206 | /* | ||
1207 | * time to start writing. Make bios for everything from the | ||
1208 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | ||
1209 | * everything else. | ||
1210 | */ | ||
1211 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
1212 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1213 | struct page *page; | ||
1214 | if (stripe < rbio->nr_data) { | ||
1215 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1216 | if (!page) | ||
1217 | continue; | ||
1218 | } else { | ||
1219 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1220 | } | ||
1221 | |||
1222 | ret = rbio_add_io_page(rbio, &bio_list, | ||
1223 | page, stripe, pagenr, rbio->stripe_len); | ||
1224 | if (ret) | ||
1225 | goto cleanup; | ||
1226 | } | ||
1227 | } | ||
1228 | |||
1229 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | ||
1230 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | ||
1231 | |||
1232 | while (1) { | ||
1233 | bio = bio_list_pop(&bio_list); | ||
1234 | if (!bio) | ||
1235 | break; | ||
1236 | |||
1237 | bio->bi_private = rbio; | ||
1238 | bio->bi_end_io = raid_write_end_io; | ||
1239 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1240 | submit_bio(WRITE, bio); | ||
1241 | } | ||
1242 | return; | ||
1243 | |||
1244 | cleanup: | ||
1245 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1246 | } | ||
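For the RAID6 branch, gen_syndrome() expects exactly the pointers[] layout built above: data blocks first, then P, then Q. P is the plain xor of the data; Q weights each block by a power of the GF(2^8) generator {02}. A tiny userspace demonstration of that math (not the kernel implementation, which uses optimized SIMD tables):

#include <stdio.h>
#include <stdint.h>

/* multiply by {02} in GF(2^8) modulo x^8 + x^4 + x^3 + x^2 + 1 */
static uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

int main(void)
{
	uint8_t data[3] = { 0x11, 0x22, 0x44 };
	uint8_t p = 0, q = 0;

	/* Horner's rule, highest-numbered data disk first */
	for (int i = 2; i >= 0; i--) {
		q = gf_mul2(q) ^ data[i];
		p ^= data[i];
	}
	printf("P=0x%02x Q=0x%02x\n", p, q);
	return 0;
}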
1247 | |||
1248 | /* | ||
1249 | * helper to find the stripe number for a given bio. Used to figure out which | ||
1250 | * stripe has failed. This expects the bio to correspond to a physical disk, | ||
1251 | * so it looks up based on physical sector numbers. | ||
1252 | */ | ||
1253 | static int find_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1254 | struct bio *bio) | ||
1255 | { | ||
1256 | u64 physical = bio->bi_sector; | ||
1257 | u64 stripe_start; | ||
1258 | int i; | ||
1259 | struct btrfs_bio_stripe *stripe; | ||
1260 | |||
1261 | physical <<= 9; | ||
1262 | |||
1263 | for (i = 0; i < rbio->bbio->num_stripes; i++) { | ||
1264 | stripe = &rbio->bbio->stripes[i]; | ||
1265 | stripe_start = stripe->physical; | ||
1266 | if (physical >= stripe_start && | ||
1267 | physical < stripe_start + rbio->stripe_len) { | ||
1268 | return i; | ||
1269 | } | ||
1270 | } | ||
1271 | return -1; | ||
1272 | } | ||
1273 | |||
1274 | /* | ||
1275 | * helper to find the stripe number for a given | ||
1276 | * bio (before mapping). Used to figure out which stripe has | ||
1277 | * failed. This looks up based on logical block numbers. | ||
1278 | */ | ||
1279 | static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1280 | struct bio *bio) | ||
1281 | { | ||
1282 | u64 logical = bio->bi_sector; | ||
1283 | u64 stripe_start; | ||
1284 | int i; | ||
1285 | |||
1286 | logical <<= 9; | ||
1287 | |||
1288 | for (i = 0; i < rbio->nr_data; i++) { | ||
1289 | stripe_start = rbio->raid_map[i]; | ||
1290 | if (logical >= stripe_start && | ||
1291 | logical < stripe_start + rbio->stripe_len) { | ||
1292 | return i; | ||
1293 | } | ||
1294 | } | ||
1295 | return -1; | ||
1296 | } | ||
1297 | |||
1298 | /* | ||
1299 | * returns -EIO if we had too many failures | ||
1300 | */ | ||
1301 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | ||
1302 | { | ||
1303 | unsigned long flags; | ||
1304 | int ret = 0; | ||
1305 | |||
1306 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
1307 | |||
1308 | /* we already know this stripe is bad, move on */ | ||
1309 | if (rbio->faila == failed || rbio->failb == failed) | ||
1310 | goto out; | ||
1311 | |||
1312 | if (rbio->faila == -1) { | ||
1313 | /* first failure on this rbio */ | ||
1314 | rbio->faila = failed; | ||
1315 | atomic_inc(&rbio->bbio->error); | ||
1316 | } else if (rbio->failb == -1) { | ||
1317 | /* second failure on this rbio */ | ||
1318 | rbio->failb = failed; | ||
1319 | atomic_inc(&rbio->bbio->error); | ||
1320 | } else { | ||
1321 | ret = -EIO; | ||
1322 | } | ||
1323 | out: | ||
1324 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
1325 | |||
1326 | return ret; | ||
1327 | } | ||
1328 | |||
1329 | /* | ||
1330 | * helper to fail a stripe based on a physical disk | ||
1331 | * bio. | ||
1332 | */ | ||
1333 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, | ||
1334 | struct bio *bio) | ||
1335 | { | ||
1336 | int failed = find_bio_stripe(rbio, bio); | ||
1337 | |||
1338 | if (failed < 0) | ||
1339 | return -EIO; | ||
1340 | |||
1341 | return fail_rbio_index(rbio, failed); | ||
1342 | } | ||
1343 | |||
1344 | /* | ||
1345 | * this sets each page in the bio uptodate. It should only be used on private | ||
1346 | * rbio pages, nothing that comes in from the higher layers | ||
1347 | */ | ||
1348 | static void set_bio_pages_uptodate(struct bio *bio) | ||
1349 | { | ||
1350 | int i; | ||
1351 | struct page *p; | ||
1352 | |||
1353 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
1354 | p = bio->bi_io_vec[i].bv_page; | ||
1355 | SetPageUptodate(p); | ||
1356 | } | ||
1357 | } | ||
1358 | |||
1359 | /* | ||
1360 | * end io for the read phase of the rmw cycle. All the bios here are physical | ||
1361 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
1362 | * stripe. | ||
1363 | * | ||
1364 | * This will usually kick off finish_rmw once all the bios are read in, but it | ||
1365 | * may trigger parity reconstruction if we had any errors along the way | ||
1366 | */ | ||
1367 | static void raid_rmw_end_io(struct bio *bio, int err) | ||
1368 | { | ||
1369 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1370 | |||
1371 | if (err) | ||
1372 | fail_bio_stripe(rbio, bio); | ||
1373 | else | ||
1374 | set_bio_pages_uptodate(bio); | ||
1375 | |||
1376 | bio_put(bio); | ||
1377 | |||
1378 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1379 | return; | ||
1380 | |||
1381 | err = 0; | ||
1382 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1383 | goto cleanup; | ||
1384 | |||
1385 | /* | ||
1386 | * this will normally call finish_rmw to start our write | ||
1387 | * but if there are any failed stripes we'll reconstruct | ||
1388 | * from parity first | ||
1389 | */ | ||
1390 | validate_rbio_for_rmw(rbio); | ||
1391 | return; | ||
1392 | |||
1393 | cleanup: | ||
1394 | |||
1395 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1396 | } | ||
1397 | |||
1398 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1399 | { | ||
1400 | rbio->work.flags = 0; | ||
1401 | rbio->work.func = rmw_work; | ||
1402 | |||
1403 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1404 | &rbio->work); | ||
1405 | } | ||
1406 | |||
1407 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | ||
1408 | { | ||
1409 | rbio->work.flags = 0; | ||
1410 | rbio->work.func = read_rebuild_work; | ||
1411 | |||
1412 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
1413 | &rbio->work); | ||
1414 | } | ||
1415 | |||
1416 | /* | ||
1417 | * the stripe must be locked by the caller. It will | ||
1418 | * unlock after all the writes are done | ||
1419 | */ | ||
1420 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
1421 | { | ||
1422 | int bios_to_read = 0; | ||
1423 | struct btrfs_bio *bbio = rbio->bbio; | ||
1424 | struct bio_list bio_list; | ||
1425 | int ret; | ||
1426 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1427 | int pagenr; | ||
1428 | int stripe; | ||
1429 | struct bio *bio; | ||
1430 | |||
1431 | bio_list_init(&bio_list); | ||
1432 | |||
1433 | ret = alloc_rbio_pages(rbio); | ||
1434 | if (ret) | ||
1435 | goto cleanup; | ||
1436 | |||
1437 | index_rbio_pages(rbio); | ||
1438 | |||
1439 | atomic_set(&rbio->bbio->error, 0); | ||
1440 | /* | ||
1441 | * build a list of bios to read all the missing parts of this | ||
1442 | * stripe | ||
1443 | */ | ||
1444 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { | ||
1445 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1446 | struct page *page; | ||
1447 | /* | ||
1448 | * we want to find all the pages missing from | ||
1449 | * the rbio and read them from the disk. If | ||
1450 | * page_in_rbio finds a page in the bio list | ||
1451 | * we don't need to read it off the stripe. | ||
1452 | */ | ||
1453 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1454 | if (page) | ||
1455 | continue; | ||
1456 | |||
1457 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1458 | /* | ||
1459 | * the bio cache may have handed us an uptodate | ||
1460 | * page. If so, be happy and use it | ||
1461 | */ | ||
1462 | if (PageUptodate(page)) | ||
1463 | continue; | ||
1464 | |||
1465 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
1466 | stripe, pagenr, rbio->stripe_len); | ||
1467 | if (ret) | ||
1468 | goto cleanup; | ||
1469 | } | ||
1470 | } | ||
1471 | |||
1472 | bios_to_read = bio_list_size(&bio_list); | ||
1473 | if (!bios_to_read) { | ||
1474 | /* | ||
1475 | * this can happen if others have merged with | ||
1476 | * us, it means there is nothing left to read. | ||
1477 | * But if there are missing devices it may not be | ||
1478 | * safe to do the full stripe write yet. | ||
1479 | */ | ||
1480 | goto finish; | ||
1481 | } | ||
1482 | |||
1483 | /* | ||
1484 | * the bbio may be freed once we submit the last bio. Make sure | ||
1485 | * not to touch it after that | ||
1486 | */ | ||
1487 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
1488 | while (1) { | ||
1489 | bio = bio_list_pop(&bio_list); | ||
1490 | if (!bio) | ||
1491 | break; | ||
1492 | |||
1493 | bio->bi_private = rbio; | ||
1494 | bio->bi_end_io = raid_rmw_end_io; | ||
1495 | |||
1496 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
1497 | BTRFS_WQ_ENDIO_RAID56); | ||
1498 | |||
1499 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1500 | submit_bio(READ, bio); | ||
1501 | } | ||
1502 | /* the actual write will happen once the reads are done */ | ||
1503 | return 0; | ||
1504 | |||
1505 | cleanup: | ||
1506 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1507 | return -EIO; | ||
1508 | |||
1509 | finish: | ||
1510 | validate_rbio_for_rmw(rbio); | ||
1511 | return 0; | ||
1512 | } | ||
1513 | |||
1514 | /* | ||
1515 | * if the upper layers pass in a full stripe, we thank them by only allocating | ||
1516 | * enough pages to hold the parity, and sending it all down quickly. | ||
1517 | */ | ||
1518 | static int full_stripe_write(struct btrfs_raid_bio *rbio) | ||
1519 | { | ||
1520 | int ret; | ||
1521 | |||
1522 | ret = alloc_rbio_parity_pages(rbio); | ||
1523 | if (ret) | ||
1524 | return ret; | ||
1525 | |||
1526 | ret = lock_stripe_add(rbio); | ||
1527 | if (ret == 0) | ||
1528 | finish_rmw(rbio); | ||
1529 | return 0; | ||
1530 | } | ||
1531 | |||
1532 | /* | ||
1533 | * partial stripe writes get handed over to async helpers. | ||
1534 | * We're really hoping to merge a few more writes into this | ||
1535 | * rbio before calculating new parity | ||
1536 | */ | ||
1537 | static int partial_stripe_write(struct btrfs_raid_bio *rbio) | ||
1538 | { | ||
1539 | int ret; | ||
1540 | |||
1541 | ret = lock_stripe_add(rbio); | ||
1542 | if (ret == 0) | ||
1543 | async_rmw_stripe(rbio); | ||
1544 | return 0; | ||
1545 | } | ||
1546 | |||
1547 | /* | ||
1548 | * sometimes while we were reading from the drive to | ||
1549 | * recalculate parity, enough new bios come in to create | ||
1550 | * a full stripe. So we do a check here to see if we can | ||
1551 | * go directly to finish_rmw | ||
1552 | */ | ||
1553 | static int __raid56_parity_write(struct btrfs_raid_bio *rbio) | ||
1554 | { | ||
1555 | /* head off into rmw land if we don't have a full stripe */ | ||
1556 | if (!rbio_is_full(rbio)) | ||
1557 | return partial_stripe_write(rbio); | ||
1558 | return full_stripe_write(rbio); | ||
1559 | } | ||
1560 | |||
1561 | /* | ||
1562 | * We use plugging call backs to collect full stripes. | ||
1563 | * Any time we get a partial stripe write while plugged | ||
1564 | * we collect it into a list. When the unplug comes down, | ||
1565 | * we sort the list by logical block number and merge | ||
1566 | * everything we can into the same rbios | ||
1567 | */ | ||
1568 | struct btrfs_plug_cb { | ||
1569 | struct blk_plug_cb cb; | ||
1570 | struct btrfs_fs_info *info; | ||
1571 | struct list_head rbio_list; | ||
1572 | struct btrfs_work work; | ||
1573 | }; | ||
1574 | |||
1575 | /* | ||
1576 | * rbios on the plug list are sorted for easier merging. | ||
1577 | */ | ||
1578 | static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
1579 | { | ||
1580 | struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, | ||
1581 | plug_list); | ||
1582 | struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, | ||
1583 | plug_list); | ||
1584 | u64 a_sector = ra->bio_list.head->bi_sector; | ||
1585 | u64 b_sector = rb->bio_list.head->bi_sector; | ||
1586 | |||
1587 | if (a_sector < b_sector) | ||
1588 | return -1; | ||
1589 | if (a_sector > b_sector) | ||
1590 | return 1; | ||
1591 | return 0; | ||
1592 | } | ||
1593 | |||
1594 | static void run_plug(struct btrfs_plug_cb *plug) | ||
1595 | { | ||
1596 | struct btrfs_raid_bio *cur; | ||
1597 | struct btrfs_raid_bio *last = NULL; | ||
1598 | |||
1599 | /* | ||
1600 | * sort our plug list then try to merge | ||
1601 | * everything we can in hopes of creating full | ||
1602 | * stripes. | ||
1603 | */ | ||
1604 | list_sort(NULL, &plug->rbio_list, plug_cmp); | ||
1605 | while (!list_empty(&plug->rbio_list)) { | ||
1606 | cur = list_entry(plug->rbio_list.next, | ||
1607 | struct btrfs_raid_bio, plug_list); | ||
1608 | list_del_init(&cur->plug_list); | ||
1609 | |||
1610 | if (rbio_is_full(cur)) { | ||
1611 | /* we have a full stripe, send it down */ | ||
1612 | full_stripe_write(cur); | ||
1613 | continue; | ||
1614 | } | ||
1615 | if (last) { | ||
1616 | if (rbio_can_merge(last, cur)) { | ||
1617 | merge_rbio(last, cur); | ||
1618 | __free_raid_bio(cur); | ||
1619 | continue; | ||
1620 | |||
1621 | } | ||
1622 | __raid56_parity_write(last); | ||
1623 | } | ||
1624 | last = cur; | ||
1625 | } | ||
1626 | if (last) { | ||
1627 | __raid56_parity_write(last); | ||
1628 | } | ||
1629 | kfree(plug); | ||
1630 | } | ||
1631 | |||
1632 | /* | ||
1633 | * if the unplug comes from schedule, we have to push the | ||
1634 | * work off to a helper thread | ||
1635 | */ | ||
1636 | static void unplug_work(struct btrfs_work *work) | ||
1637 | { | ||
1638 | struct btrfs_plug_cb *plug; | ||
1639 | plug = container_of(work, struct btrfs_plug_cb, work); | ||
1640 | run_plug(plug); | ||
1641 | } | ||
1642 | |||
1643 | static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
1644 | { | ||
1645 | struct btrfs_plug_cb *plug; | ||
1646 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
1647 | |||
1648 | if (from_schedule) { | ||
1649 | plug->work.flags = 0; | ||
1650 | plug->work.func = unplug_work; | ||
1651 | btrfs_queue_worker(&plug->info->rmw_workers, | ||
1652 | &plug->work); | ||
1653 | return; | ||
1654 | } | ||
1655 | run_plug(plug); | ||
1656 | } | ||
1657 | |||
1658 | /* | ||
1659 | * our main entry point for writes from the rest of the FS. | ||
1660 | */ | ||
1661 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
1662 | struct btrfs_bio *bbio, u64 *raid_map, | ||
1663 | u64 stripe_len) | ||
1664 | { | ||
1665 | struct btrfs_raid_bio *rbio; | ||
1666 | struct btrfs_plug_cb *plug = NULL; | ||
1667 | struct blk_plug_cb *cb; | ||
1668 | |||
1669 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
1670 | if (IS_ERR(rbio)) { | ||
1671 | kfree(raid_map); | ||
1672 | kfree(bbio); | ||
1673 | return PTR_ERR(rbio); | ||
1674 | } | ||
1675 | bio_list_add(&rbio->bio_list, bio); | ||
1676 | rbio->bio_list_bytes = bio->bi_size; | ||
1677 | |||
1678 | /* | ||
1679 | * don't plug on full rbios, just get them out the door | ||
1680 | * as quickly as we can | ||
1681 | */ | ||
1682 | if (rbio_is_full(rbio)) | ||
1683 | return full_stripe_write(rbio); | ||
1684 | |||
1685 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | ||
1686 | sizeof(*plug)); | ||
1687 | if (cb) { | ||
1688 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
1689 | if (!plug->info) { | ||
1690 | plug->info = root->fs_info; | ||
1691 | INIT_LIST_HEAD(&plug->rbio_list); | ||
1692 | } | ||
1693 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | ||
1694 | } else { | ||
1695 | return __raid56_parity_write(rbio); | ||
1696 | } | ||
1697 | return 0; | ||
1698 | } | ||
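
full_stripe_write() is taken above when the incoming bios already cover the whole data width of the stripe set, so no read-modify-write cycle is needed; anything smaller sits on the plug list waiting for merge partners. A toy version of that fullness test (hypothetical names; the real rbio_is_full() compares bio_list_bytes against rbio->nr_data * rbio->stripe_len under the bio_list lock):

    #include <stdio.h>

    /* sketch of the full-stripe test: a write can bypass RMW when the
     * bytes queued equal the stripe data width (assumed arithmetic) */
    static int toy_rbio_is_full(unsigned long long bytes_queued,
                                unsigned long long stripe_len, int nr_data)
    {
            return bytes_queued == stripe_len * nr_data;
    }

    int main(void)
    {
            unsigned long long stripe_len = 64 * 1024;

            /* 3-disk raid5: 2 data stripes, 128K of data per full stripe */
            printf("128K write: %s\n",
                   toy_rbio_is_full(131072, stripe_len, 2) ? "full stripe" : "rmw");
            printf("64K write:  %s\n",
                   toy_rbio_is_full(65536, stripe_len, 2) ? "full stripe" : "rmw");
            return 0;
    }
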
1699 | |||
1700 | /* | ||
1701 | * all parity reconstruction happens here. We've read in everything | ||
1702 | * we can find from the drives and this does the heavy lifting of | ||
1703 | * sorting the good from the bad. | ||
1704 | */ | ||
1705 | static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | ||
1706 | { | ||
1707 | int pagenr, stripe; | ||
1708 | void **pointers; | ||
1709 | int faila = -1, failb = -1; | ||
1710 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1711 | struct page *page; | ||
1712 | int err; | ||
1713 | int i; | ||
1714 | |||
1715 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | ||
1716 | GFP_NOFS); | ||
1717 | if (!pointers) { | ||
1718 | err = -ENOMEM; | ||
1719 | goto cleanup_io; | ||
1720 | } | ||
1721 | |||
1722 | faila = rbio->faila; | ||
1723 | failb = rbio->failb; | ||
1724 | |||
1725 | if (rbio->read_rebuild) { | ||
1726 | spin_lock_irq(&rbio->bio_list_lock); | ||
1727 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
1728 | spin_unlock_irq(&rbio->bio_list_lock); | ||
1729 | } | ||
1730 | |||
1731 | index_rbio_pages(rbio); | ||
1732 | |||
1733 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1734 | /* setup our array of pointers with pages | ||
1735 | * from each stripe | ||
1736 | */ | ||
1737 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1738 | /* | ||
1739 | * if we're rebuilding a read, we have to use | ||
1740 | * pages from the bio list | ||
1741 | */ | ||
1742 | if (rbio->read_rebuild && | ||
1743 | (stripe == faila || stripe == failb)) { | ||
1744 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1745 | } else { | ||
1746 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1747 | } | ||
1748 | pointers[stripe] = kmap(page); | ||
1749 | } | ||
1750 | |||
1751 | /* all raid6 handling here */ | ||
1752 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | ||
1753 | RAID6_Q_STRIPE) { | ||
1754 | |||
1755 | /* | ||
1756 | * single failure, rebuild from parity raid5 | ||
1757 | * style | ||
1758 | */ | ||
1759 | if (failb < 0) { | ||
1760 | if (faila == rbio->nr_data) { | ||
1761 | /* | ||
1762 | * Just the P stripe has failed, without | ||
1763 | * a bad data or Q stripe. | ||
1764 | * TODO, we should redo the xor here. | ||
1765 | */ | ||
1766 | err = -EIO; | ||
1767 | goto cleanup; | ||
1768 | } | ||
1769 | /* | ||
1770 | * a single failure in raid6 is rebuilt | ||
1771 | * in the pstripe code below | ||
1772 | */ | ||
1773 | goto pstripe; | ||
1774 | } | ||
1775 | |||
1776 | /* make sure our ps and qs are in order */ | ||
1777 | if (faila > failb) { | ||
1778 | int tmp = failb; | ||
1779 | failb = faila; | ||
1780 | faila = tmp; | ||
1781 | } | ||
1782 | |||
1783 | /* if the Q stripe failed, do a P-stripe reconstruction | ||
1784 | * from the XORs. | ||
1785 | * If both the Q stripe and the P stripe failed, we're | ||
1786 | * here due to a crc mismatch and we can't give the caller | ||
1787 | * the data it wants | ||
1788 | */ | ||
1789 | if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { | ||
1790 | if (rbio->raid_map[faila] == RAID5_P_STRIPE) { | ||
1791 | err = -EIO; | ||
1792 | goto cleanup; | ||
1793 | } | ||
1794 | /* | ||
1795 | * otherwise we have one bad data stripe and | ||
1796 | * a good P stripe. raid5! | ||
1797 | */ | ||
1798 | goto pstripe; | ||
1799 | } | ||
1800 | |||
1801 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | ||
1802 | raid6_datap_recov(rbio->bbio->num_stripes, | ||
1803 | PAGE_SIZE, faila, pointers); | ||
1804 | } else { | ||
1805 | raid6_2data_recov(rbio->bbio->num_stripes, | ||
1806 | PAGE_SIZE, faila, failb, | ||
1807 | pointers); | ||
1808 | } | ||
1809 | } else { | ||
1810 | void *p; | ||
1811 | |||
1812 | /* rebuild from P stripe here (raid5 or raid6) */ | ||
1813 | BUG_ON(failb != -1); | ||
1814 | pstripe: | ||
1815 | /* Copy parity block into failed block to start with */ | ||
1816 | memcpy(pointers[faila], | ||
1817 | pointers[rbio->nr_data], | ||
1818 | PAGE_CACHE_SIZE); | ||
1819 | |||
1820 | /* rearrange the pointer array */ | ||
1821 | p = pointers[faila]; | ||
1822 | for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) | ||
1823 | pointers[stripe] = pointers[stripe + 1]; | ||
1824 | pointers[rbio->nr_data - 1] = p; | ||
1825 | |||
1826 | /* xor in the rest */ | ||
1827 | run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); | ||
1828 | } | ||
1829 | /* if we're doing this rebuild as part of an rmw, go through | ||
1830 | * and set all of our private rbio pages in the | ||
1831 | * failed stripes as uptodate. This way finish_rmw will | ||
1832 | * know they can be trusted. If this was a read reconstruction, | ||
1833 | * other endio functions will fiddle the uptodate bits | ||
1834 | */ | ||
1835 | if (!rbio->read_rebuild) { | ||
1836 | for (i = 0; i < nr_pages; i++) { | ||
1837 | if (faila != -1) { | ||
1838 | page = rbio_stripe_page(rbio, faila, i); | ||
1839 | SetPageUptodate(page); | ||
1840 | } | ||
1841 | if (failb != -1) { | ||
1842 | page = rbio_stripe_page(rbio, failb, i); | ||
1843 | SetPageUptodate(page); | ||
1844 | } | ||
1845 | } | ||
1846 | } | ||
1847 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
1848 | /* | ||
1849 | * if we're rebuilding a read, we have to use | ||
1850 | * pages from the bio list | ||
1851 | */ | ||
1852 | if (rbio->read_rebuild && | ||
1853 | (stripe == faila || stripe == failb)) { | ||
1854 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
1855 | } else { | ||
1856 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1857 | } | ||
1858 | kunmap(page); | ||
1859 | } | ||
1860 | } | ||
1861 | |||
1862 | err = 0; | ||
1863 | cleanup: | ||
1864 | kfree(pointers); | ||
1865 | |||
1866 | cleanup_io: | ||
1867 | |||
1868 | if (rbio->read_rebuild) { | ||
1869 | if (err == 0) | ||
1870 | cache_rbio_pages(rbio); | ||
1871 | else | ||
1872 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
1873 | |||
1874 | rbio_orig_end_io(rbio, err, err == 0); | ||
1875 | } else if (err == 0) { | ||
1876 | rbio->faila = -1; | ||
1877 | rbio->failb = -1; | ||
1878 | finish_rmw(rbio); | ||
1879 | } else { | ||
1880 | rbio_orig_end_io(rbio, err, 0); | ||
1881 | } | ||
1882 | } | ||
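
The pstripe: path above is plain RAID5 recovery: copy the parity block into the failed slot, rotate the pointer array so the surviving blocks line up, and XOR them back in. A self-contained illustration of that rebuild (toy sizes; the kernel does this per page via the xor_blocks-backed run_xor(), and the two-failure RAID6 cases go through lib/raid6's raid6_datap_recov()/raid6_2data_recov() instead):

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define STRIPE 8   /* toy stripe size; the kernel works per page */
    #define NDATA  3   /* data stripes; slot NDATA holds parity */

    static void xor_into(unsigned char *dst, const unsigned char *src)
    {
            int i;

            for (i = 0; i < STRIPE; i++)
                    dst[i] ^= src[i];
    }

    int main(void)
    {
            /* the fourth row (parity) starts zeroed */
            unsigned char stripes[NDATA + 1][STRIPE] = {
                    "ABCDEFG", "1234567", "abcdefg",
            };
            unsigned char lost[STRIPE];
            int faila = 1, d;   /* pretend data stripe 1 was lost */

            /* P = d0 ^ d1 ^ d2, as finish_rmw computes it */
            for (d = 0; d < NDATA; d++)
                    xor_into(stripes[NDATA], stripes[d]);

            /* rebuild: start from parity, XOR in every survivor */
            memcpy(lost, stripes[NDATA], STRIPE);
            for (d = 0; d < NDATA; d++)
                    if (d != faila)
                            xor_into(lost, stripes[d]);

            assert(memcmp(lost, stripes[faila], STRIPE) == 0);
            printf("rebuilt stripe %d: %s\n", faila, (char *)lost);
            return 0;
    }
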
1883 | |||
1884 | /* | ||
1885 | * This is called only for stripes we've read from disk to | ||
1886 | * reconstruct the parity. | ||
1887 | */ | ||
1888 | static void raid_recover_end_io(struct bio *bio, int err) | ||
1889 | { | ||
1890 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
1891 | |||
1892 | /* | ||
1893 | * we only read stripe pages off the disk, set them | ||
1894 | * up to date if there were no errors | ||
1895 | */ | ||
1896 | if (err) | ||
1897 | fail_bio_stripe(rbio, bio); | ||
1898 | else | ||
1899 | set_bio_pages_uptodate(bio); | ||
1900 | bio_put(bio); | ||
1901 | |||
1902 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
1903 | return; | ||
1904 | |||
1905 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
1906 | rbio_orig_end_io(rbio, -EIO, 0); | ||
1907 | else | ||
1908 | __raid_recover_end_io(rbio); | ||
1909 | } | ||
1910 | |||
1911 | /* | ||
1912 | * reads everything we need off the disk to reconstruct | ||
1913 | * the parity. endio handlers trigger final reconstruction | ||
1914 | * when the IO is done. | ||
1915 | * | ||
1916 | * This is used both for reads from the higher layers and for | ||
1917 | * parity construction required to finish a rmw cycle. | ||
1918 | */ | ||
1919 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | ||
1920 | { | ||
1921 | int bios_to_read = 0; | ||
1922 | struct btrfs_bio *bbio = rbio->bbio; | ||
1923 | struct bio_list bio_list; | ||
1924 | int ret; | ||
1925 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1926 | int pagenr; | ||
1927 | int stripe; | ||
1928 | struct bio *bio; | ||
1929 | |||
1930 | bio_list_init(&bio_list); | ||
1931 | |||
1932 | ret = alloc_rbio_pages(rbio); | ||
1933 | if (ret) | ||
1934 | goto cleanup; | ||
1935 | |||
1936 | atomic_set(&rbio->bbio->error, 0); | ||
1937 | |||
1938 | /* | ||
1939 | * read everything that hasn't failed. Thanks to the | ||
1940 | * stripe cache, it is possible that some or all of these | ||
1941 | * pages are going to be uptodate. | ||
1942 | */ | ||
1943 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
1944 | if (rbio->faila == stripe || | ||
1945 | rbio->failb == stripe) | ||
1946 | continue; | ||
1947 | |||
1948 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
1949 | struct page *p; | ||
1950 | |||
1951 | /* | ||
1952 | * the rmw code may have already read this | ||
1953 | * page in | ||
1954 | */ | ||
1955 | p = rbio_stripe_page(rbio, stripe, pagenr); | ||
1956 | if (PageUptodate(p)) | ||
1957 | continue; | ||
1958 | |||
1959 | ret = rbio_add_io_page(rbio, &bio_list, | ||
1960 | rbio_stripe_page(rbio, stripe, pagenr), | ||
1961 | stripe, pagenr, rbio->stripe_len); | ||
1962 | if (ret < 0) | ||
1963 | goto cleanup; | ||
1964 | } | ||
1965 | } | ||
1966 | |||
1967 | bios_to_read = bio_list_size(&bio_list); | ||
1968 | if (!bios_to_read) { | ||
1969 | /* | ||
1970 | * we might have no bios to read just because the pages | ||
1971 | * were already up to date, or because the devices | ||
1972 | * were gone. | ||
1973 | */ | ||
1974 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | ||
1975 | __raid_recover_end_io(rbio); | ||
1976 | goto out; | ||
1977 | } else { | ||
1978 | goto cleanup; | ||
1979 | } | ||
1980 | } | ||
1981 | |||
1982 | /* | ||
1983 | * the bbio may be freed once we submit the last bio. Make sure | ||
1984 | * not to touch it after that | ||
1985 | */ | ||
1986 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
1987 | while (1) { | ||
1988 | bio = bio_list_pop(&bio_list); | ||
1989 | if (!bio) | ||
1990 | break; | ||
1991 | |||
1992 | bio->bi_private = rbio; | ||
1993 | bio->bi_end_io = raid_recover_end_io; | ||
1994 | |||
1995 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
1996 | BTRFS_WQ_ENDIO_RAID56); | ||
1997 | |||
1998 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
1999 | submit_bio(READ, bio); | ||
2000 | } | ||
2001 | out: | ||
2002 | return 0; | ||
2003 | |||
2004 | cleanup: | ||
2005 | if (rbio->read_rebuild) | ||
2006 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2007 | return -EIO; | ||
2008 | } | ||
2009 | |||
2010 | /* | ||
2011 | * the main entry point for reads from the higher layers. This | ||
2012 | * is really only called when the normal read path had a failure, | ||
2013 | * so we assume the bio they send down corresponds to a failed part | ||
2014 | * of the drive. | ||
2015 | */ | ||
2016 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
2017 | struct btrfs_bio *bbio, u64 *raid_map, | ||
2018 | u64 stripe_len, int mirror_num) | ||
2019 | { | ||
2020 | struct btrfs_raid_bio *rbio; | ||
2021 | int ret; | ||
2022 | |||
2023 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
2024 | if (IS_ERR(rbio)) { | ||
2025 | return PTR_ERR(rbio); | ||
2026 | } | ||
2027 | |||
2028 | rbio->read_rebuild = 1; | ||
2029 | bio_list_add(&rbio->bio_list, bio); | ||
2030 | rbio->bio_list_bytes = bio->bi_size; | ||
2031 | |||
2032 | rbio->faila = find_logical_bio_stripe(rbio, bio); | ||
2033 | if (rbio->faila == -1) { | ||
2034 | BUG(); | ||
2035 | kfree(rbio); | ||
2036 | return -EIO; | ||
2037 | } | ||
2038 | |||
2039 | /* | ||
2040 | * reconstruct from the Q stripe if the caller | ||
2041 | * is asking for mirror 3 | ||
2042 | */ | ||
2043 | if (mirror_num == 3) | ||
2044 | rbio->failb = bbio->num_stripes - 2; | ||
2045 | |||
2046 | ret = lock_stripe_add(rbio); | ||
2047 | |||
2048 | /* | ||
2049 | * __raid56_parity_recover will end the bio with | ||
2050 | * any errors it hits. We don't want to return | ||
2051 | * its error value up the stack because our caller | ||
2052 | * will end up calling bio_endio with any nonzero | ||
2053 | * return | ||
2054 | */ | ||
2055 | if (ret == 0) | ||
2056 | __raid56_parity_recover(rbio); | ||
2057 | /* | ||
2058 | * our rbio has been added to the list of | ||
2059 | * rbios that will be handled after the | ||
2060 | * current lock owner is done | ||
2061 | */ | ||
2062 | return 0; | ||
2063 | |||
2064 | } | ||
2065 | |||
2066 | static void rmw_work(struct btrfs_work *work) | ||
2067 | { | ||
2068 | struct btrfs_raid_bio *rbio; | ||
2069 | |||
2070 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2071 | raid56_rmw_stripe(rbio); | ||
2072 | } | ||
2073 | |||
2074 | static void read_rebuild_work(struct btrfs_work *work) | ||
2075 | { | ||
2076 | struct btrfs_raid_bio *rbio; | ||
2077 | |||
2078 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2079 | __raid56_parity_recover(rbio); | ||
2080 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h | |||
@@ -0,0 +1,51 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public | ||
7 | * License v2 as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
12 | * General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public | ||
15 | * License along with this program; if not, write to the | ||
16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
17 | * Boston, MA 021110-1307, USA. | ||
18 | */ | ||
19 | |||
20 | #ifndef __BTRFS_RAID56__ | ||
21 | #define __BTRFS_RAID56__ | ||
22 | static inline int nr_parity_stripes(struct map_lookup *map) | ||
23 | { | ||
24 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
25 | return 1; | ||
26 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
27 | return 2; | ||
28 | else | ||
29 | return 0; | ||
30 | } | ||
31 | |||
32 | static inline int nr_data_stripes(struct map_lookup *map) | ||
33 | { | ||
34 | return map->num_stripes - nr_parity_stripes(map); | ||
35 | } | ||
36 | #define RAID5_P_STRIPE ((u64)-2) | ||
37 | #define RAID6_Q_STRIPE ((u64)-1) | ||
38 | |||
39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | ||
40 | ((x) == RAID6_Q_STRIPE)) | ||
41 | |||
42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
43 | struct btrfs_bio *bbio, u64 *raid_map, | ||
44 | u64 stripe_len, int mirror_num); | ||
45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
46 | struct btrfs_bio *bbio, u64 *raid_map, | ||
47 | u64 stripe_len); | ||
48 | |||
49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | ||
50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | ||
51 | #endif | ||
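
The two helpers above are used throughout the write and mapping paths: everything about a RAID5/6 chunk reduces to how many of its num_stripes are data. A userspace mock showing them in action (the flag values here are assumptions for the sketch, not taken from ctree.h):

    #include <stdio.h>

    #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)   /* assumed flag values */
    #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)

    /* userspace mock of the map_lookup fields the helpers consume */
    struct map_lookup {
            unsigned long long type;
            int num_stripes;
    };

    static int nr_parity_stripes(struct map_lookup *map)
    {
            if (map->type & BTRFS_BLOCK_GROUP_RAID5)
                    return 1;
            if (map->type & BTRFS_BLOCK_GROUP_RAID6)
                    return 2;
            return 0;
    }

    static int nr_data_stripes(struct map_lookup *map)
    {
            return map->num_stripes - nr_parity_stripes(map);
    }

    int main(void)
    {
            struct map_lookup r5 = { BTRFS_BLOCK_GROUP_RAID5, 4 };
            struct map_lookup r6 = { BTRFS_BLOCK_GROUP_RAID6, 6 };

            printf("raid5 x4: %d data stripes\n", nr_data_stripes(&r5)); /* 3 */
            printf("raid6 x6: %d data stripes\n", nr_data_stripes(&r6)); /* 4 */
            return 0;
    }
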
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c78b2a3fc335..53c3501fa4ca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
29 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
30 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
31 | #include "raid56.h" | ||
31 | 32 | ||
32 | /* | 33 | /* |
33 | * This is only the first step towards a full-features scrub. It reads all | 34 | * This is only the first step towards a full-features scrub. It reads all |
@@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2254 | struct btrfs_device *extent_dev; | 2255 | struct btrfs_device *extent_dev; |
2255 | int extent_mirror_num; | 2256 | int extent_mirror_num; |
2256 | 2257 | ||
2258 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
2259 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
2260 | if (num >= nr_data_stripes(map)) { | ||
2261 | return 0; | ||
2262 | } | ||
2263 | } | ||
2264 | |||
2257 | nstripes = length; | 2265 | nstripes = length; |
2258 | offset = 0; | 2266 | offset = 0; |
2259 | do_div(nstripes, map->stripe_len); | 2267 | do_div(nstripes, map->stripe_len); |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 955204ca0447..a83d486cc70c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -167,6 +167,9 @@ loop: | |||
167 | 167 | ||
168 | spin_lock_init(&cur_trans->commit_lock); | 168 | spin_lock_init(&cur_trans->commit_lock); |
169 | spin_lock_init(&cur_trans->delayed_refs.lock); | 169 | spin_lock_init(&cur_trans->delayed_refs.lock); |
170 | atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); | ||
171 | atomic_set(&cur_trans->delayed_refs.ref_seq, 0); | ||
172 | init_waitqueue_head(&cur_trans->delayed_refs.wait); | ||
170 | 173 | ||
171 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 174 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
172 | INIT_LIST_HEAD(&cur_trans->ordered_operations); | 175 | INIT_LIST_HEAD(&cur_trans->ordered_operations); |
@@ -637,7 +640,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
637 | if (!list_empty(&trans->new_bgs)) | 640 | if (!list_empty(&trans->new_bgs)) |
638 | btrfs_create_pending_block_groups(trans, root); | 641 | btrfs_create_pending_block_groups(trans, root); |
639 | 642 | ||
640 | while (count < 2) { | 643 | while (count < 1) { |
641 | unsigned long cur = trans->delayed_ref_updates; | 644 | unsigned long cur = trans->delayed_ref_updates; |
642 | trans->delayed_ref_updates = 0; | 645 | trans->delayed_ref_updates = 0; |
643 | if (cur && | 646 | if (cur && |
@@ -649,6 +652,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
649 | } | 652 | } |
650 | count++; | 653 | count++; |
651 | } | 654 | } |
655 | |||
652 | btrfs_trans_release_metadata(trans, root); | 656 | btrfs_trans_release_metadata(trans, root); |
653 | trans->block_rsv = NULL; | 657 | trans->block_rsv = NULL; |
654 | 658 | ||
@@ -744,7 +748,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
744 | struct extent_state *cached_state = NULL; | 748 | struct extent_state *cached_state = NULL; |
745 | u64 start = 0; | 749 | u64 start = 0; |
746 | u64 end; | 750 | u64 end; |
751 | struct blk_plug plug; | ||
747 | 752 | ||
753 | blk_start_plug(&plug); | ||
748 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 754 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
749 | mark, &cached_state)) { | 755 | mark, &cached_state)) { |
750 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 756 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, |
@@ -758,6 +764,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
758 | } | 764 | } |
759 | if (err) | 765 | if (err) |
760 | werr = err; | 766 | werr = err; |
767 | blk_finish_plug(&plug); | ||
761 | return werr; | 768 | return werr; |
762 | } | 769 | } |
763 | 770 | ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 72b1cf1b2b5e..7992dc4ea4cc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -25,6 +25,8 @@ | |||
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
28 | #include <linux/raid/pq.h> | ||
29 | #include <asm/div64.h> | ||
28 | #include "compat.h" | 30 | #include "compat.h" |
29 | #include "ctree.h" | 31 | #include "ctree.h" |
30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
@@ -32,6 +34,7 @@ | |||
32 | #include "transaction.h" | 34 | #include "transaction.h" |
33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
34 | #include "volumes.h" | 36 | #include "volumes.h" |
37 | #include "raid56.h" | ||
35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
@@ -1465,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
1465 | goto out; | 1468 | goto out; |
1466 | } | 1469 | } |
1467 | 1470 | ||
1471 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
1472 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
1473 | printk(KERN_ERR "btrfs: unable to go below two " | ||
1474 | "devices on raid5\n"); | ||
1475 | ret = -EINVAL; | ||
1476 | goto out; | ||
1477 | } | ||
1478 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
1479 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
1480 | printk(KERN_ERR "btrfs: unable to go below three " | ||
1481 | "devices on raid6\n"); | ||
1482 | ret = -EINVAL; | ||
1483 | goto out; | ||
1484 | } | ||
1485 | |||
1468 | if (strcmp(device_path, "missing") == 0) { | 1486 | if (strcmp(device_path, "missing") == 0) { |
1469 | struct list_head *devices; | 1487 | struct list_head *devices; |
1470 | struct btrfs_device *tmp; | 1488 | struct btrfs_device *tmp; |
@@ -2726,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
2726 | return 0; | 2744 | return 0; |
2727 | 2745 | ||
2728 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2746 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
2729 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2747 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
2730 | factor = 2; | 2748 | factor = num_stripes / 2; |
2731 | else | 2749 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
2732 | factor = 1; | 2750 | factor = num_stripes - 1; |
2733 | factor = num_stripes / factor; | 2751 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
2752 | factor = num_stripes - 2; | ||
2753 | } else { | ||
2754 | factor = num_stripes; | ||
2755 | } | ||
2734 | 2756 | ||
2735 | for (i = 0; i < num_stripes; i++) { | 2757 | for (i = 0; i < num_stripes; i++) { |
2736 | stripe = btrfs_stripe_nr(chunk, i); | 2758 | stripe = btrfs_stripe_nr(chunk, i); |
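
The reworked factor above converts a chunk's logical length into the stripe length each device contributes: mirrored profiles divide by the copy count, RAID5/6 by the data-stripe count. A hypothetical standalone version of that mapping:

    #include <stdio.h>
    #include <string.h>

    /* hypothetical helper mirroring the drange factor logic above */
    static int drange_factor(const char *profile, int num_stripes)
    {
            if (!strcmp(profile, "dup") || !strcmp(profile, "raid1") ||
                !strcmp(profile, "raid10"))
                    return num_stripes / 2;
            if (!strcmp(profile, "raid5"))
                    return num_stripes - 1;
            if (!strcmp(profile, "raid6"))
                    return num_stripes - 2;
            return num_stripes;   /* single, raid0 */
    }

    int main(void)
    {
            /* per-device stripe length = chunk length / factor */
            unsigned long long chunk_len = 1ULL << 30;   /* 1 GiB chunk */
            const char *profiles[] = { "raid0", "raid5", "raid6", "raid10" };
            int i;

            for (i = 0; i < 4; i++)
                    printf("%-6s on 6 devs: %llu MiB per device\n",
                           profiles[i],
                           chunk_len / drange_factor(profiles[i], 6) >> 20);
            return 0;
    }
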
@@ -3090,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3090 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3112 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
3091 | else | 3113 | else |
3092 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3114 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
3093 | BTRFS_BLOCK_GROUP_RAID10); | 3115 | BTRFS_BLOCK_GROUP_RAID10 | |
3116 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3117 | BTRFS_BLOCK_GROUP_RAID6); | ||
3094 | 3118 | ||
3095 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3119 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
3096 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3120 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
@@ -3130,7 +3154,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3130 | 3154 | ||
3131 | /* allow to reduce meta or sys integrity only if force set */ | 3155 | /* allow to reduce meta or sys integrity only if force set */ |
3132 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3156 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
3133 | BTRFS_BLOCK_GROUP_RAID10; | 3157 | BTRFS_BLOCK_GROUP_RAID10 | |
3158 | BTRFS_BLOCK_GROUP_RAID5 | | ||
3159 | BTRFS_BLOCK_GROUP_RAID6; | ||
3134 | do { | 3160 | do { |
3135 | seq = read_seqbegin(&fs_info->profiles_lock); | 3161 | seq = read_seqbegin(&fs_info->profiles_lock); |
3136 | 3162 | ||
@@ -3204,11 +3230,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
3204 | update_ioctl_balance_args(fs_info, 0, bargs); | 3230 | update_ioctl_balance_args(fs_info, 0, bargs); |
3205 | } | 3231 | } |
3206 | 3232 | ||
3207 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
3208 | balance_need_close(fs_info)) { | ||
3209 | __cancel_balance(fs_info); | ||
3210 | } | ||
3211 | |||
3212 | wake_up(&fs_info->balance_wait_q); | 3233 | wake_up(&fs_info->balance_wait_q); |
3213 | 3234 | ||
3214 | return ret; | 3235 | return ret; |
@@ -3611,8 +3632,46 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | |||
3611 | .devs_increment = 1, | 3632 | .devs_increment = 1, |
3612 | .ncopies = 1, | 3633 | .ncopies = 1, |
3613 | }, | 3634 | }, |
3635 | [BTRFS_RAID_RAID5] = { | ||
3636 | .sub_stripes = 1, | ||
3637 | .dev_stripes = 1, | ||
3638 | .devs_max = 0, | ||
3639 | .devs_min = 2, | ||
3640 | .devs_increment = 1, | ||
3641 | .ncopies = 2, | ||
3642 | }, | ||
3643 | [BTRFS_RAID_RAID6] = { | ||
3644 | .sub_stripes = 1, | ||
3645 | .dev_stripes = 1, | ||
3646 | .devs_max = 0, | ||
3647 | .devs_min = 3, | ||
3648 | .devs_increment = 1, | ||
3649 | .ncopies = 3, | ||
3650 | }, | ||
3614 | }; | 3651 | }; |
3615 | 3652 | ||
3653 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
3654 | { | ||
3655 | /* TODO allow them to set a preferred stripe size */ | ||
3656 | return 64 * 1024; | ||
3657 | } | ||
3658 | |||
3659 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
3660 | { | ||
3661 | u64 features; | ||
3662 | |||
3663 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
3664 | return; | ||
3665 | |||
3666 | features = btrfs_super_incompat_flags(info->super_copy); | ||
3667 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
3668 | return; | ||
3669 | |||
3670 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
3671 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
3672 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
3673 | } | ||
3674 | |||
3616 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3675 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
3617 | struct btrfs_root *extent_root, | 3676 | struct btrfs_root *extent_root, |
3618 | struct map_lookup **map_ret, | 3677 | struct map_lookup **map_ret, |
@@ -3628,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3628 | struct btrfs_device_info *devices_info = NULL; | 3687 | struct btrfs_device_info *devices_info = NULL; |
3629 | u64 total_avail; | 3688 | u64 total_avail; |
3630 | int num_stripes; /* total number of stripes to allocate */ | 3689 | int num_stripes; /* total number of stripes to allocate */ |
3690 | int data_stripes; /* number of stripes that count for | ||
3691 | block group size */ | ||
3631 | int sub_stripes; /* sub_stripes info for map */ | 3692 | int sub_stripes; /* sub_stripes info for map */ |
3632 | int dev_stripes; /* stripes per dev */ | 3693 | int dev_stripes; /* stripes per dev */ |
3633 | int devs_max; /* max devs to use */ | 3694 | int devs_max; /* max devs to use */ |
@@ -3639,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3639 | u64 max_chunk_size; | 3700 | u64 max_chunk_size; |
3640 | u64 stripe_size; | 3701 | u64 stripe_size; |
3641 | u64 num_bytes; | 3702 | u64 num_bytes; |
3703 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
3642 | int ndevs; | 3704 | int ndevs; |
3643 | int i; | 3705 | int i; |
3644 | int j; | 3706 | int j; |
@@ -3768,16 +3830,31 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3768 | stripe_size = devices_info[ndevs-1].max_avail; | 3830 | stripe_size = devices_info[ndevs-1].max_avail; |
3769 | num_stripes = ndevs * dev_stripes; | 3831 | num_stripes = ndevs * dev_stripes; |
3770 | 3832 | ||
3833 | /* | ||
3834 | * this will have to be fixed for RAID1 and RAID10 over | ||
3835 | * more drives | ||
3836 | */ | ||
3837 | data_stripes = num_stripes / ncopies; | ||
3838 | |||
3771 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3839 | if (stripe_size * ndevs > max_chunk_size * ncopies) { |
3772 | stripe_size = max_chunk_size * ncopies; | 3840 | stripe_size = max_chunk_size * ncopies; |
3773 | do_div(stripe_size, ndevs); | 3841 | do_div(stripe_size, ndevs); |
3774 | } | 3842 | } |
3775 | | 3843 | if (type & BTRFS_BLOCK_GROUP_RAID5) {
3844 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
3845 | btrfs_super_stripesize(info->super_copy)); | ||
3846 | data_stripes = num_stripes - 1; | ||
3847 | } | ||
3848 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
3849 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
3850 | btrfs_super_stripesize(info->super_copy)); | ||
3851 | data_stripes = num_stripes - 2; | ||
3852 | } | ||
3776 | do_div(stripe_size, dev_stripes); | 3853 | do_div(stripe_size, dev_stripes); |
3777 | 3854 | ||
3778 | /* align to BTRFS_STRIPE_LEN */ | 3855 | /* align to BTRFS_STRIPE_LEN */ |
3779 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3856 | do_div(stripe_size, raid_stripe_len); |
3780 | stripe_size *= BTRFS_STRIPE_LEN; | 3857 | stripe_size *= raid_stripe_len; |
3781 | 3858 | ||
3782 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3859 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
3783 | if (!map) { | 3860 | if (!map) { |
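
The data_stripes variable above fixes the block-group accounting for the parity profiles: a chunk's usable size is stripe_size times the data stripes only, where the old num_stripes / ncopies formula would overcount (RAID5's ncopies of 2 is a redundancy hint, not a literal copy count). Worked through under assumed sizes:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long stripe_size = 1ULL << 30;  /* 1 GiB per device */
            int num_stripes = 5;   /* one stripe on each of 5 devices */

            /* RAID5: num_bytes = stripe_size * (num_stripes - 1) */
            printf("raid5 chunk: %llu GiB usable\n",
                   stripe_size * (num_stripes - 1) >> 30);   /* 4 GiB */
            /* RAID6: two parity stripes */
            printf("raid6 chunk: %llu GiB usable\n",
                   stripe_size * (num_stripes - 2) >> 30);   /* 3 GiB */
            return 0;
    }
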
@@ -3795,14 +3872,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3795 | } | 3872 | } |
3796 | } | 3873 | } |
3797 | map->sector_size = extent_root->sectorsize; | 3874 | map->sector_size = extent_root->sectorsize; |
3798 | map->stripe_len = BTRFS_STRIPE_LEN; | 3875 | map->stripe_len = raid_stripe_len; |
3799 | map->io_align = BTRFS_STRIPE_LEN; | 3876 | map->io_align = raid_stripe_len; |
3800 | map->io_width = BTRFS_STRIPE_LEN; | 3877 | map->io_width = raid_stripe_len; |
3801 | map->type = type; | 3878 | map->type = type; |
3802 | map->sub_stripes = sub_stripes; | 3879 | map->sub_stripes = sub_stripes; |
3803 | 3880 | ||
3804 | *map_ret = map; | 3881 | *map_ret = map; |
3805 | num_bytes = stripe_size * (num_stripes / ncopies); | 3882 | num_bytes = stripe_size * data_stripes; |
3806 | 3883 | ||
3807 | *stripe_size_out = stripe_size; | 3884 | *stripe_size_out = stripe_size; |
3808 | *num_bytes_out = num_bytes; | 3885 | *num_bytes_out = num_bytes; |
@@ -3853,6 +3930,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
3853 | } | 3930 | } |
3854 | 3931 | ||
3855 | free_extent_map(em); | 3932 | free_extent_map(em); |
3933 | check_raid56_incompat_flag(extent_root->fs_info, type); | ||
3934 | |||
3856 | kfree(devices_info); | 3935 | kfree(devices_info); |
3857 | return 0; | 3936 | return 0; |
3858 | 3937 | ||
@@ -4136,6 +4215,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4136 | ret = map->num_stripes; | 4215 | ret = map->num_stripes; |
4137 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4216 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4138 | ret = map->sub_stripes; | 4217 | ret = map->sub_stripes; |
4218 | else if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
4219 | ret = 2; | ||
4220 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4221 | ret = 3; | ||
4139 | else | 4222 | else |
4140 | ret = 1; | 4223 | ret = 1; |
4141 | free_extent_map(em); | 4224 | free_extent_map(em); |
@@ -4148,6 +4231,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
4148 | return ret; | 4231 | return ret; |
4149 | } | 4232 | } |
4150 | 4233 | ||
4234 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
4235 | struct btrfs_mapping_tree *map_tree, | ||
4236 | u64 logical) | ||
4237 | { | ||
4238 | struct extent_map *em; | ||
4239 | struct map_lookup *map; | ||
4240 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4241 | unsigned long len = root->sectorsize; | ||
4242 | |||
4243 | read_lock(&em_tree->lock); | ||
4244 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4245 | read_unlock(&em_tree->lock); | ||
4246 | BUG_ON(!em); | ||
4247 | |||
4248 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4249 | map = (struct map_lookup *)em->bdev; | ||
4250 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4251 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4252 | len = map->stripe_len * nr_data_stripes(map); | ||
4253 | } | ||
4254 | free_extent_map(em); | ||
4255 | return len; | ||
4256 | } | ||
4257 | |||
4258 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
4259 | u64 logical, u64 len, int mirror_num) | ||
4260 | { | ||
4261 | struct extent_map *em; | ||
4262 | struct map_lookup *map; | ||
4263 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
4264 | int ret = 0; | ||
4265 | |||
4266 | read_lock(&em_tree->lock); | ||
4267 | em = lookup_extent_mapping(em_tree, logical, len); | ||
4268 | read_unlock(&em_tree->lock); | ||
4269 | BUG_ON(!em); | ||
4270 | |||
4271 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
4272 | map = (struct map_lookup *)em->bdev; | ||
4273 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4274 | BTRFS_BLOCK_GROUP_RAID6)) | ||
4275 | ret = 1; | ||
4276 | free_extent_map(em); | ||
4277 | return ret; | ||
4278 | } | ||
4279 | |||
4151 | static int find_live_mirror(struct btrfs_fs_info *fs_info, | 4280 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
4152 | struct map_lookup *map, int first, int num, | 4281 | struct map_lookup *map, int first, int num, |
4153 | int optimal, int dev_replace_is_ongoing) | 4282 | int optimal, int dev_replace_is_ongoing) |
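
btrfs_full_stripe_len() above widens the caller's view from one sector to the whole stripe set, since RAID5/6 locking and scrubbing have to operate on full stripes. The arithmetic, under assumed geometry:

    #include <stdio.h>

    int main(void)
    {
            unsigned long stripe_len = 64 * 1024;   /* per-device stripe */
            int num_stripes = 6;                    /* devices in the chunk */

            /* full stripe = stripe_len bytes on each *data* device */
            printf("raid5 x6: %lu KiB full stripe\n",
                   stripe_len * (num_stripes - 1) / 1024);   /* 320 */
            printf("raid6 x6: %lu KiB full stripe\n",
                   stripe_len * (num_stripes - 2) / 1024);   /* 256 */
            return 0;
    }
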
@@ -4185,10 +4314,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
4185 | return optimal; | 4314 | return optimal; |
4186 | } | 4315 | } |
4187 | 4316 | ||
4317 | static inline int parity_smaller(u64 a, u64 b) | ||
4318 | { | ||
4319 | return a > b; | ||
4320 | } | ||
4321 | |||
4322 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ | ||
4323 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | ||
4324 | { | ||
4325 | struct btrfs_bio_stripe s; | ||
4326 | int i; | ||
4327 | u64 l; | ||
4328 | int again = 1; | ||
4329 | |||
4330 | while (again) { | ||
4331 | again = 0; | ||
4332 | for (i = 0; i < bbio->num_stripes - 1; i++) { | ||
4333 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | ||
4334 | s = bbio->stripes[i]; | ||
4335 | l = raid_map[i]; | ||
4336 | bbio->stripes[i] = bbio->stripes[i+1]; | ||
4337 | raid_map[i] = raid_map[i+1]; | ||
4338 | bbio->stripes[i+1] = s; | ||
4339 | raid_map[i+1] = l; | ||
4340 | again = 1; | ||
4341 | } | ||
4342 | } | ||
4343 | } | ||
4344 | } | ||
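
The bubble sort above leans on a small trick: RAID5_P_STRIPE ((u64)-2) and RAID6_Q_STRIPE ((u64)-1) are the two largest possible u64 values, so sorting raid_map ascending always leaves the data stripes first and P then Q last, with bbio->stripes swapped in lockstep. A standalone check of that ordering property (raid_map only; the paired stripe swap is omitted):

    #include <stdint.h>
    #include <stdio.h>

    #define RAID5_P_STRIPE ((uint64_t)-2)
    #define RAID6_Q_STRIPE ((uint64_t)-1)

    /* same pairwise swap criterion as parity_smaller() */
    static int parity_smaller(uint64_t a, uint64_t b)
    {
            return a > b;
    }

    int main(void)
    {
            /* a rotated stripe set: Q, data, P, data */
            uint64_t raid_map[] = { RAID6_Q_STRIPE, 4096, RAID5_P_STRIPE, 0 };
            int n = 4, i, again = 1;

            while (again) {   /* bubble sort, as in sort_parity_stripes() */
                    again = 0;
                    for (i = 0; i < n - 1; i++) {
                            if (parity_smaller(raid_map[i], raid_map[i + 1])) {
                                    uint64_t t = raid_map[i];

                                    raid_map[i] = raid_map[i + 1];
                                    raid_map[i + 1] = t;
                                    again = 1;
                            }
                    }
            }
            /* data stripes (0, 4096) end up first, P and Q last */
            for (i = 0; i < n; i++)
                    printf("slot %d: %llu\n", i,
                           (unsigned long long)raid_map[i]);
            return 0;
    }
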
4345 | |||
4188 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 4346 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
4189 | u64 logical, u64 *length, | 4347 | u64 logical, u64 *length, |
4190 | struct btrfs_bio **bbio_ret, | 4348 | struct btrfs_bio **bbio_ret, |
4191 | int mirror_num) | 4349 | int mirror_num, u64 **raid_map_ret) |
4192 | { | 4350 | { |
4193 | struct extent_map *em; | 4351 | struct extent_map *em; |
4194 | struct map_lookup *map; | 4352 | struct map_lookup *map; |
@@ -4200,6 +4358,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4200 | u64 stripe_nr; | 4358 | u64 stripe_nr; |
4201 | u64 stripe_nr_orig; | 4359 | u64 stripe_nr_orig; |
4202 | u64 stripe_nr_end; | 4360 | u64 stripe_nr_end; |
4361 | u64 stripe_len; | ||
4362 | u64 *raid_map = NULL; | ||
4203 | int stripe_index; | 4363 | int stripe_index; |
4204 | int i; | 4364 | int i; |
4205 | int ret = 0; | 4365 | int ret = 0; |
@@ -4211,6 +4371,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4211 | int num_alloc_stripes; | 4371 | int num_alloc_stripes; |
4212 | int patch_the_first_stripe_for_dev_replace = 0; | 4372 | int patch_the_first_stripe_for_dev_replace = 0; |
4213 | u64 physical_to_patch_in_first_stripe = 0; | 4373 | u64 physical_to_patch_in_first_stripe = 0; |
4374 | u64 raid56_full_stripe_start = (u64)-1; | ||
4214 | 4375 | ||
4215 | read_lock(&em_tree->lock); | 4376 | read_lock(&em_tree->lock); |
4216 | em = lookup_extent_mapping(em_tree, logical, *length); | 4377 | em = lookup_extent_mapping(em_tree, logical, *length); |
@@ -4227,29 +4388,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4227 | map = (struct map_lookup *)em->bdev; | 4388 | map = (struct map_lookup *)em->bdev; |
4228 | offset = logical - em->start; | 4389 | offset = logical - em->start; |
4229 | 4390 | ||
4391 | if (mirror_num > map->num_stripes) | ||
4392 | mirror_num = 0; | ||
4393 | |||
4394 | stripe_len = map->stripe_len; | ||
4230 | stripe_nr = offset; | 4395 | stripe_nr = offset; |
4231 | /* | 4396 | /* |
4232 | * stripe_nr counts the total number of stripes we have to stride | 4397 | * stripe_nr counts the total number of stripes we have to stride |
4233 | * to get to this block | 4398 | * to get to this block |
4234 | */ | 4399 | */ |
4235 | do_div(stripe_nr, map->stripe_len); | 4400 | do_div(stripe_nr, stripe_len); |
4236 | 4401 | ||
4237 | stripe_offset = stripe_nr * map->stripe_len; | 4402 | stripe_offset = stripe_nr * stripe_len; |
4238 | BUG_ON(offset < stripe_offset); | 4403 | BUG_ON(offset < stripe_offset); |
4239 | 4404 | ||
4240 | /* stripe_offset is the offset of this block in its stripe*/ | 4405 | /* stripe_offset is the offset of this block in its stripe*/ |
4241 | stripe_offset = offset - stripe_offset; | 4406 | stripe_offset = offset - stripe_offset; |
4242 | 4407 | ||
4243 | if (rw & REQ_DISCARD) | 4408 | /* if we're here for raid56, we need to know the stripe aligned start */ |
4409 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4410 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
4411 | raid56_full_stripe_start = offset; | ||
4412 | |||
4413 | /* allow a write of a full stripe, but make sure we don't | ||
4414 | * allow straddling of stripes | ||
4415 | */ | ||
4416 | do_div(raid56_full_stripe_start, full_stripe_len); | ||
4417 | raid56_full_stripe_start *= full_stripe_len; | ||
4418 | } | ||
4419 | |||
4420 | if (rw & REQ_DISCARD) { | ||
4421 | /* we don't discard raid56 yet */ | ||
4422 | if (map->type & | ||
4423 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4424 | ret = -EOPNOTSUPP; | ||
4425 | goto out; | ||
4426 | } | ||
4244 | *length = min_t(u64, em->len - offset, *length); | 4427 | *length = min_t(u64, em->len - offset, *length); |
4245 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | 4428 | } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
4246 | /* we limit the length of each bio to what fits in a stripe */ | 4429 | u64 max_len; |
4247 | *length = min_t(u64, em->len - offset, | 4430 | /* For writes to RAID[56], allow a full stripeset across all disks. |
4248 | map->stripe_len - stripe_offset); | 4431 | For other RAID types and for RAID[56] reads, just allow a single |
4432 | stripe (on a single disk). */ | ||
4433 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && | ||
4434 | (rw & REQ_WRITE)) { | ||
4435 | max_len = stripe_len * nr_data_stripes(map) - | ||
4436 | (offset - raid56_full_stripe_start); | ||
4437 | } else { | ||
4438 | /* we limit the length of each bio to what fits in a stripe */ | ||
4439 | max_len = stripe_len - stripe_offset; | ||
4440 | } | ||
4441 | *length = min_t(u64, em->len - offset, max_len); | ||
4249 | } else { | 4442 | } else { |
4250 | *length = em->len - offset; | 4443 | *length = em->len - offset; |
4251 | } | 4444 | } |
4252 | 4445 | ||
4446 | /* This is for when we're called from btrfs_merge_bio_hook() and all | ||
4447 | it cares about is the length */ | ||
4253 | if (!bbio_ret) | 4448 | if (!bbio_ret) |
4254 | goto out; | 4449 | goto out; |
4255 | 4450 | ||
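
raid56_full_stripe_start above is just the offset rounded down to a full-stripe boundary, and the write-length cap is whatever remains of that full stripe, which keeps a single bio from straddling two stripe sets. The same arithmetic in plain C (assumed numbers):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long stripe_len = 65536, nr_data = 3;
            unsigned long long full = stripe_len * nr_data;   /* 192 KiB */
            unsigned long long offset = 300000;   /* block offset in chunk */

            /* round down, as the two do_div() calls do */
            unsigned long long start = offset / full * full;
            unsigned long long max_write = full - (offset - start);

            printf("full stripe start %llu, max write len %llu\n",
                   start, max_write);   /* 196608, 93216 */
            return 0;
    }
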
@@ -4282,7 +4477,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4282 | u64 physical_of_found = 0; | 4477 | u64 physical_of_found = 0; |
4283 | 4478 | ||
4284 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | 4479 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, |
4285 | logical, &tmp_length, &tmp_bbio, 0); | 4480 | logical, &tmp_length, &tmp_bbio, 0, NULL); |
4286 | if (ret) { | 4481 | if (ret) { |
4287 | WARN_ON(tmp_bbio != NULL); | 4482 | WARN_ON(tmp_bbio != NULL); |
4288 | goto out; | 4483 | goto out; |
@@ -4348,6 +4543,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4348 | do_div(stripe_nr_end, map->stripe_len); | 4543 | do_div(stripe_nr_end, map->stripe_len); |
4349 | stripe_end_offset = stripe_nr_end * map->stripe_len - | 4544 | stripe_end_offset = stripe_nr_end * map->stripe_len - |
4350 | (offset + *length); | 4545 | (offset + *length); |
4546 | |||
4351 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4547 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4352 | if (rw & REQ_DISCARD) | 4548 | if (rw & REQ_DISCARD) |
4353 | num_stripes = min_t(u64, map->num_stripes, | 4549 | num_stripes = min_t(u64, map->num_stripes, |
@@ -4398,6 +4594,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4398 | dev_replace_is_ongoing); | 4594 | dev_replace_is_ongoing); |
4399 | mirror_num = stripe_index - old_stripe_index + 1; | 4595 | mirror_num = stripe_index - old_stripe_index + 1; |
4400 | } | 4596 | } |
4597 | |||
4598 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4599 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4600 | u64 tmp; | ||
4601 | |||
4602 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | ||
4603 | && raid_map_ret) { | ||
4604 | int i, rot; | ||
4605 | |||
4606 | /* push stripe_nr back to the start of the full stripe */ | ||
4607 | stripe_nr = raid56_full_stripe_start; | ||
4608 | do_div(stripe_nr, stripe_len); | ||
4609 | |||
4610 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4611 | |||
4612 | /* RAID[56] write or recovery. Return all stripes */ | ||
4613 | num_stripes = map->num_stripes; | ||
4614 | max_errors = nr_parity_stripes(map); | ||
4615 | |||
4616 | raid_map = kmalloc(sizeof(u64) * num_stripes, | ||
4617 | GFP_NOFS); | ||
4618 | if (!raid_map) { | ||
4619 | ret = -ENOMEM; | ||
4620 | goto out; | ||
4621 | } | ||
4622 | |||
4623 | /* Work out the disk rotation on this stripe-set */ | ||
4624 | tmp = stripe_nr; | ||
4625 | rot = do_div(tmp, num_stripes); | ||
4626 | |||
4627 | /* Fill in the logical address of each stripe */ | ||
4628 | tmp = stripe_nr * nr_data_stripes(map); | ||
4629 | for (i = 0; i < nr_data_stripes(map); i++) | ||
4630 | raid_map[(i+rot) % num_stripes] = | ||
4631 | em->start + (tmp + i) * map->stripe_len; | ||
4632 | |||
4633 | raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; | ||
4634 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
4635 | raid_map[(i+rot+1) % num_stripes] = | ||
4636 | RAID6_Q_STRIPE; | ||
4637 | |||
4638 | *length = map->stripe_len; | ||
4639 | stripe_index = 0; | ||
4640 | stripe_offset = 0; | ||
4641 | } else { | ||
4642 | /* | ||
4643 | * Mirror #0 or #1 means the original data block. | ||
4644 | * Mirror #2 is RAID5 parity block. | ||
4645 | * Mirror #3 is RAID6 Q block. | ||
4646 | */ | ||
4647 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
4648 | if (mirror_num > 1) | ||
4649 | stripe_index = nr_data_stripes(map) + | ||
4650 | mirror_num - 2; | ||
4651 | |||
4652 | /* We distribute the parity blocks across stripes */ | ||
4653 | tmp = stripe_nr + stripe_index; | ||
4654 | stripe_index = do_div(tmp, map->num_stripes); | ||
4655 | } | ||
4401 | } else { | 4656 | } else { |
4402 | /* | 4657 | /* |
4403 | * after this do_div call, stripe_nr is the number of stripes | 4658 | * after this do_div call, stripe_nr is the number of stripes |
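
The raid_map fill above implements parity rotation: rot = stripe_nr % num_stripes picks the slot where the first data stripe lands, the remaining data stripes follow, and P (plus Q on RAID6) takes the slot(s) after the data. A hypothetical walk of that layout for a 3-disk RAID5 with 64K stripes:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            int num_stripes = 3, nr_data = 2;   /* 3-disk RAID5 */
            uint64_t stripe_len = 64 * 1024, em_start = 0, stripe_nr;

            for (stripe_nr = 0; stripe_nr < 3; stripe_nr++) {
                    uint64_t map[3], tmp = stripe_nr * nr_data;
                    int rot = stripe_nr % num_stripes, i;

                    /* logical addresses of the data stripes, rotated */
                    for (i = 0; i < nr_data; i++)
                            map[(i + rot) % num_stripes] =
                                    em_start + (tmp + i) * stripe_len;
                    /* parity takes the slot after the data */
                    map[(i + rot) % num_stripes] = (uint64_t)-2;   /* P */

                    printf("full stripe %llu:", (unsigned long long)stripe_nr);
                    for (i = 0; i < num_stripes; i++) {
                            if (map[i] == (uint64_t)-2)
                                    printf("       P");
                            else
                                    printf(" %7llu",
                                           (unsigned long long)map[i]);
                    }
                    printf("\n");   /* P visibly moves one slot per stripe */
            }
            return 0;
    }
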
@@ -4506,8 +4761,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4506 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { | 4761 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
4507 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4762 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
4508 | BTRFS_BLOCK_GROUP_RAID10 | | 4763 | BTRFS_BLOCK_GROUP_RAID10 | |
4764 | BTRFS_BLOCK_GROUP_RAID5 | | ||
4509 | BTRFS_BLOCK_GROUP_DUP)) { | 4765 | BTRFS_BLOCK_GROUP_DUP)) { |
4510 | max_errors = 1; | 4766 | max_errors = 1; |
4767 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
4768 | max_errors = 2; | ||
4511 | } | 4769 | } |
4512 | } | 4770 | } |
4513 | 4771 | ||
@@ -4608,6 +4866,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4608 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | 4866 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; |
4609 | bbio->mirror_num = map->num_stripes + 1; | 4867 | bbio->mirror_num = map->num_stripes + 1; |
4610 | } | 4868 | } |
4869 | if (raid_map) { | ||
4870 | sort_parity_stripes(bbio, raid_map); | ||
4871 | *raid_map_ret = raid_map; | ||
4872 | } | ||
4611 | out: | 4873 | out: |
4612 | if (dev_replace_is_ongoing) | 4874 | if (dev_replace_is_ongoing) |
4613 | btrfs_dev_replace_unlock(dev_replace); | 4875 | btrfs_dev_replace_unlock(dev_replace); |
@@ -4620,7 +4882,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4620 | struct btrfs_bio **bbio_ret, int mirror_num) | 4882 | struct btrfs_bio **bbio_ret, int mirror_num) |
4621 | { | 4883 | { |
4622 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | 4884 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
4623 | mirror_num); | 4885 | mirror_num, NULL); |
4624 | } | 4886 | } |
4625 | 4887 | ||
4626 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 4888 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
@@ -4634,6 +4896,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4634 | u64 bytenr; | 4896 | u64 bytenr; |
4635 | u64 length; | 4897 | u64 length; |
4636 | u64 stripe_nr; | 4898 | u64 stripe_nr; |
4899 | u64 rmap_len; | ||
4637 | int i, j, nr = 0; | 4900 | int i, j, nr = 0; |
4638 | 4901 | ||
4639 | read_lock(&em_tree->lock); | 4902 | read_lock(&em_tree->lock); |
@@ -4644,10 +4907,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4644 | map = (struct map_lookup *)em->bdev; | 4907 | map = (struct map_lookup *)em->bdev; |
4645 | 4908 | ||
4646 | length = em->len; | 4909 | length = em->len; |
4910 | rmap_len = map->stripe_len; | ||
4911 | |||
4647 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4912 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
4648 | do_div(length, map->num_stripes / map->sub_stripes); | 4913 | do_div(length, map->num_stripes / map->sub_stripes); |
4649 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | 4914 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
4650 | do_div(length, map->num_stripes); | 4915 | do_div(length, map->num_stripes); |
4916 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
4917 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
4918 | do_div(length, nr_data_stripes(map)); | ||
4919 | rmap_len = map->stripe_len * nr_data_stripes(map); | ||
4920 | } | ||
4651 | 4921 | ||
4652 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | 4922 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); |
4653 | BUG_ON(!buf); /* -ENOMEM */ | 4923 | BUG_ON(!buf); /* -ENOMEM */ |
@@ -4667,8 +4937,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4667 | do_div(stripe_nr, map->sub_stripes); | 4937 | do_div(stripe_nr, map->sub_stripes); |
4668 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4938 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
4669 | stripe_nr = stripe_nr * map->num_stripes + i; | 4939 | stripe_nr = stripe_nr * map->num_stripes + i; |
4670 | } | 4940 | } /* else if RAID[56], multiply by nr_data_stripes(). |
4671 | bytenr = chunk_start + stripe_nr * map->stripe_len; | 4941 | * Alternatively, just use rmap_len below instead of |
4942 | * map->stripe_len */ | ||
4943 | |||
4944 | bytenr = chunk_start + stripe_nr * rmap_len; | ||
4672 | WARN_ON(nr >= map->num_stripes); | 4945 | WARN_ON(nr >= map->num_stripes); |
4673 | for (j = 0; j < nr; j++) { | 4946 | for (j = 0; j < nr; j++) { |
4674 | if (buf[j] == bytenr) | 4947 | if (buf[j] == bytenr) |
@@ -4682,7 +4955,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
4682 | 4955 | ||
4683 | *logical = buf; | 4956 | *logical = buf; |
4684 | *naddrs = nr; | 4957 | *naddrs = nr; |
4685 | *stripe_len = map->stripe_len; | 4958 | *stripe_len = rmap_len; |
4686 | 4959 | ||
4687 | free_extent_map(em); | 4960 | free_extent_map(em); |
4688 | return 0; | 4961 | return 0; |
@@ -4756,7 +5029,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
4756 | bio->bi_bdev = (struct block_device *) | 5029 | bio->bi_bdev = (struct block_device *) |
4757 | (unsigned long)bbio->mirror_num; | 5030 | (unsigned long)bbio->mirror_num; |
4758 | /* only send an error to the higher layers if it is | 5031 | /* only send an error to the higher layers if it is |
4759 | * beyond the tolerance of the multi-bio | 5032 | * beyond the tolerance of the btrfs bio |
4760 | */ | 5033 | */ |
4761 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 5034 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
4762 | err = -EIO; | 5035 | err = -EIO; |
@@ -4790,13 +5063,18 @@ struct async_sched { | |||
4790 | * This will add one bio to the pending list for a device and make sure | 5063 | * This will add one bio to the pending list for a device and make sure |
4791 | * the work struct is scheduled. | 5064 | * the work struct is scheduled. |
4792 | */ | 5065 | */ |
4793 | static noinline void schedule_bio(struct btrfs_root *root, | 5066 | noinline void btrfs_schedule_bio(struct btrfs_root *root, |
4794 | struct btrfs_device *device, | 5067 | struct btrfs_device *device, |
4795 | int rw, struct bio *bio) | 5068 | int rw, struct bio *bio) |
4796 | { | 5069 | { |
4797 | int should_queue = 1; | 5070 | int should_queue = 1; |
4798 | struct btrfs_pending_bios *pending_bios; | 5071 | struct btrfs_pending_bios *pending_bios; |
4799 | 5072 | ||
5073 | if (device->missing || !device->bdev) { | ||
5074 | bio_endio(bio, -EIO); | ||
5075 | return; | ||
5076 | } | ||
5077 | |||
4800 | /* don't bother with additional async steps for reads, right now */ | 5078 | /* don't bother with additional async steps for reads, right now */ |
4801 | if (!(rw & REQ_WRITE)) { | 5079 | if (!(rw & REQ_WRITE)) { |
4802 | bio_get(bio); | 5080 | bio_get(bio); |
@@ -4894,7 +5172,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | |||
4894 | #endif | 5172 | #endif |
4895 | bio->bi_bdev = dev->bdev; | 5173 | bio->bi_bdev = dev->bdev; |
4896 | if (async) | 5174 | if (async) |
4897 | schedule_bio(root, dev, rw, bio); | 5175 | btrfs_schedule_bio(root, dev, rw, bio); |
4898 | else | 5176 | else |
4899 | btrfsic_submit_bio(rw, bio); | 5177 | btrfsic_submit_bio(rw, bio); |
4900 | } | 5178 | } |
@@ -4953,6 +5231,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4953 | u64 logical = (u64)bio->bi_sector << 9; | 5231 | u64 logical = (u64)bio->bi_sector << 9; |
4954 | u64 length = 0; | 5232 | u64 length = 0; |
4955 | u64 map_length; | 5233 | u64 map_length; |
5234 | u64 *raid_map = NULL; | ||
4956 | int ret; | 5235 | int ret; |
4957 | int dev_nr = 0; | 5236 | int dev_nr = 0; |
4958 | int total_devs = 1; | 5237 | int total_devs = 1; |
@@ -4961,12 +5240,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4961 | length = bio->bi_size; | 5240 | length = bio->bi_size; |
4962 | map_length = length; | 5241 | map_length = length; |
4963 | 5242 | ||
4964 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, | 5243 | ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
4965 | mirror_num); | 5244 | mirror_num, &raid_map); |
4966 | if (ret) | 5245 | if (ret) /* -ENOMEM */ |
4967 | return ret; | 5246 | return ret; |
4968 | 5247 | ||
4969 | total_devs = bbio->num_stripes; | 5248 | total_devs = bbio->num_stripes; |
5249 | bbio->orig_bio = first_bio; | ||
5250 | bbio->private = first_bio->bi_private; | ||
5251 | bbio->end_io = first_bio->bi_end_io; | ||
5252 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
5253 | |||
5254 | if (raid_map) { | ||
5255 | /* In this case, map_length has been set to the length of | ||
5256 | a single stripe; not the whole write */ | ||
5257 | if (rw & WRITE) { | ||
5258 | return raid56_parity_write(root, bio, bbio, | ||
5259 | raid_map, map_length); | ||
5260 | } else { | ||
5261 | return raid56_parity_recover(root, bio, bbio, | ||
5262 | raid_map, map_length, | ||
5263 | mirror_num); | ||
5264 | } | ||
5265 | } | ||
5266 | |||
4970 | if (map_length < length) { | 5267 | if (map_length < length) { |
4971 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " | 5268 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " |
4972 | "len %llu\n", (unsigned long long)logical, | 5269 | "len %llu\n", (unsigned long long)logical, |
@@ -4975,11 +5272,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
4975 | BUG(); | 5272 | BUG(); |
4976 | } | 5273 | } |
4977 | 5274 | ||
4978 | bbio->orig_bio = first_bio; | ||
4979 | bbio->private = first_bio->bi_private; | ||
4980 | bbio->end_io = first_bio->bi_end_io; | ||
4981 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
4982 | |||
4983 | while (dev_nr < total_devs) { | 5275 | while (dev_nr < total_devs) { |
4984 | dev = bbio->stripes[dev_nr].dev; | 5276 | dev = bbio->stripes[dev_nr].dev; |
4985 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | 5277 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 12bb84166a5f..062d8604d35b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, |
322 | struct btrfs_device *tgtdev); | 322 | struct btrfs_device *tgtdev); |
323 | int btrfs_scratch_superblock(struct btrfs_device *device); | 323 | int btrfs_scratch_superblock(struct btrfs_device *device); |
324 | | 324 | void btrfs_schedule_bio(struct btrfs_root *root,
325 | struct btrfs_device *device, | ||
326 | int rw, struct bio *bio); | ||
327 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
328 | u64 logical, u64 len, int mirror_num); | ||
329 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
330 | struct btrfs_mapping_tree *map_tree, | ||
331 | u64 logical); | ||
325 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, | 332 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, |
326 | int index) | 333 | int index) |
327 | { | 334 | { |