aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/disk-io.c
diff options
context:
space:
mode:
authorDavid Woodhouse <David.Woodhouse@intel.com>2013-01-29 18:40:14 -0500
committerChris Mason <chris.mason@fusionio.com>2013-02-01 14:24:23 -0500
commit53b381b3abeb86f12787a6c40fee9b2f71edc23b (patch)
treec1018ba2157778f0200d2ede0c0df48fe5df8f14 /fs/btrfs/disk-io.c
parent64a167011bcabc1e855658387c8a4464b71f3138 (diff)
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit, blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet). The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/disk-io.c')
-rw-r--r--fs/btrfs/disk-io.c62
1 files changed, 53 insertions, 9 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 65f03670a952..e9fa7b4d18e3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,6 +46,7 @@
46#include "check-integrity.h" 46#include "check-integrity.h"
47#include "rcu-string.h" 47#include "rcu-string.h"
48#include "dev-replace.h" 48#include "dev-replace.h"
49#include "raid56.h"
49 50
50#ifdef CONFIG_X86 51#ifdef CONFIG_X86
51#include <asm/cpufeature.h> 52#include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ err:
639 btree_readahead_hook(root, eb, eb->start, ret); 640 btree_readahead_hook(root, eb, eb->start, ret);
640 } 641 }
641 642
642 if (ret) 643 if (ret) {
644 /*
645 * our io error hook is going to dec the io pages
646 * again, we have to make sure it has something
647 * to decrement
648 */
649 atomic_inc(&eb->io_pages);
643 clear_extent_buffer_uptodate(eb); 650 clear_extent_buffer_uptodate(eb);
651 }
644 free_extent_buffer(eb); 652 free_extent_buffer(eb);
645out: 653out:
646 return ret; 654 return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
654 eb = (struct extent_buffer *)page->private; 662 eb = (struct extent_buffer *)page->private;
655 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 663 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
656 eb->read_mirror = failed_mirror; 664 eb->read_mirror = failed_mirror;
665 atomic_dec(&eb->io_pages);
657 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 666 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
658 btree_readahead_hook(root, eb, eb->start, -EIO); 667 btree_readahead_hook(root, eb, eb->start, -EIO);
659 return -EIO; /* we fixed nothing */ 668 return -EIO; /* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
670 end_io_wq->work.flags = 0; 679 end_io_wq->work.flags = 0;
671 680
672 if (bio->bi_rw & REQ_WRITE) { 681 if (bio->bi_rw & REQ_WRITE) {
673 if (end_io_wq->metadata == 1) 682 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
674 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 683 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
675 &end_io_wq->work); 684 &end_io_wq->work);
676 else if (end_io_wq->metadata == 2) 685 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
677 btrfs_queue_worker(&fs_info->endio_freespace_worker, 686 btrfs_queue_worker(&fs_info->endio_freespace_worker,
678 &end_io_wq->work); 687 &end_io_wq->work);
688 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
689 btrfs_queue_worker(&fs_info->endio_raid56_workers,
690 &end_io_wq->work);
679 else 691 else
680 btrfs_queue_worker(&fs_info->endio_write_workers, 692 btrfs_queue_worker(&fs_info->endio_write_workers,
681 &end_io_wq->work); 693 &end_io_wq->work);
682 } else { 694 } else {
683 if (end_io_wq->metadata) 695 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
696 btrfs_queue_worker(&fs_info->endio_raid56_workers,
697 &end_io_wq->work);
698 else if (end_io_wq->metadata)
684 btrfs_queue_worker(&fs_info->endio_meta_workers, 699 btrfs_queue_worker(&fs_info->endio_meta_workers,
685 &end_io_wq->work); 700 &end_io_wq->work);
686 else 701 else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
695 * 0 - if data 710 * 0 - if data
696 * 1 - if normal metadata 711 * 1 - if normal metadata
697 * 2 - if writing to the free space cache area 712 * 2 - if writing to the free space cache area
713 * 3 - raid parity work
698 */ 714 */
699int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 715int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
700 int metadata) 716 int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
2165 init_waitqueue_head(&fs_info->transaction_blocked_wait); 2181 init_waitqueue_head(&fs_info->transaction_blocked_wait);
2166 init_waitqueue_head(&fs_info->async_submit_wait); 2182 init_waitqueue_head(&fs_info->async_submit_wait);
2167 2183
2184 ret = btrfs_alloc_stripe_hash_table(fs_info);
2185 if (ret) {
2186 err = -ENOMEM;
2187 goto fail_alloc;
2188 }
2189
2168 __setup_root(4096, 4096, 4096, 4096, tree_root, 2190 __setup_root(4096, 4096, 4096, 4096, tree_root,
2169 fs_info, BTRFS_ROOT_TREE_OBJECTID); 2191 fs_info, BTRFS_ROOT_TREE_OBJECTID);
2170 2192
@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
2332 btrfs_init_workers(&fs_info->endio_meta_write_workers, 2354 btrfs_init_workers(&fs_info->endio_meta_write_workers,
2333 "endio-meta-write", fs_info->thread_pool_size, 2355 "endio-meta-write", fs_info->thread_pool_size,
2334 &fs_info->generic_worker); 2356 &fs_info->generic_worker);
2357 btrfs_init_workers(&fs_info->endio_raid56_workers,
2358 "endio-raid56", fs_info->thread_pool_size,
2359 &fs_info->generic_worker);
2360 btrfs_init_workers(&fs_info->rmw_workers,
2361 "rmw", fs_info->thread_pool_size,
2362 &fs_info->generic_worker);
2335 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 2363 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
2336 fs_info->thread_pool_size, 2364 fs_info->thread_pool_size,
2337 &fs_info->generic_worker); 2365 &fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
2350 */ 2378 */
2351 fs_info->endio_workers.idle_thresh = 4; 2379 fs_info->endio_workers.idle_thresh = 4;
2352 fs_info->endio_meta_workers.idle_thresh = 4; 2380 fs_info->endio_meta_workers.idle_thresh = 4;
2381 fs_info->endio_raid56_workers.idle_thresh = 4;
2382 fs_info->rmw_workers.idle_thresh = 2;
2353 2383
2354 fs_info->endio_write_workers.idle_thresh = 2; 2384 fs_info->endio_write_workers.idle_thresh = 2;
2355 fs_info->endio_meta_write_workers.idle_thresh = 2; 2385 fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
2366 ret |= btrfs_start_workers(&fs_info->fixup_workers); 2396 ret |= btrfs_start_workers(&fs_info->fixup_workers);
2367 ret |= btrfs_start_workers(&fs_info->endio_workers); 2397 ret |= btrfs_start_workers(&fs_info->endio_workers);
2368 ret |= btrfs_start_workers(&fs_info->endio_meta_workers); 2398 ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
2399 ret |= btrfs_start_workers(&fs_info->rmw_workers);
2400 ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
2369 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); 2401 ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
2370 ret |= btrfs_start_workers(&fs_info->endio_write_workers); 2402 ret |= btrfs_start_workers(&fs_info->endio_write_workers);
2371 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); 2403 ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ fail_sb_buffer:
2710 btrfs_stop_workers(&fs_info->workers); 2742 btrfs_stop_workers(&fs_info->workers);
2711 btrfs_stop_workers(&fs_info->endio_workers); 2743 btrfs_stop_workers(&fs_info->endio_workers);
2712 btrfs_stop_workers(&fs_info->endio_meta_workers); 2744 btrfs_stop_workers(&fs_info->endio_meta_workers);
2745 btrfs_stop_workers(&fs_info->endio_raid56_workers);
2746 btrfs_stop_workers(&fs_info->rmw_workers);
2713 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2747 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2714 btrfs_stop_workers(&fs_info->endio_write_workers); 2748 btrfs_stop_workers(&fs_info->endio_write_workers);
2715 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2749 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ fail_bdi:
2728fail_srcu: 2762fail_srcu:
2729 cleanup_srcu_struct(&fs_info->subvol_srcu); 2763 cleanup_srcu_struct(&fs_info->subvol_srcu);
2730fail: 2764fail:
2765 btrfs_free_stripe_hash_table(fs_info);
2731 btrfs_close_devices(fs_info->fs_devices); 2766 btrfs_close_devices(fs_info->fs_devices);
2732 return err; 2767 return err;
2733 2768
@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
3076 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) 3111 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
3077 == 0))) 3112 == 0)))
3078 num_tolerated_disk_barrier_failures = 0; 3113 num_tolerated_disk_barrier_failures = 0;
3079 else if (num_tolerated_disk_barrier_failures > 1 3114 else if (num_tolerated_disk_barrier_failures > 1) {
3080 && 3115 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3081 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3116 BTRFS_BLOCK_GROUP_RAID5 |
3082 BTRFS_BLOCK_GROUP_RAID10))) 3117 BTRFS_BLOCK_GROUP_RAID10)) {
3083 num_tolerated_disk_barrier_failures = 1; 3118 num_tolerated_disk_barrier_failures = 1;
3119 } else if (flags &
3120 BTRFS_BLOCK_GROUP_RAID5) {
3121 num_tolerated_disk_barrier_failures = 2;
3122 }
3123 }
3084 } 3124 }
3085 } 3125 }
3086 up_read(&sinfo->groups_sem); 3126 up_read(&sinfo->groups_sem);
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
3384 btrfs_stop_workers(&fs_info->workers); 3424 btrfs_stop_workers(&fs_info->workers);
3385 btrfs_stop_workers(&fs_info->endio_workers); 3425 btrfs_stop_workers(&fs_info->endio_workers);
3386 btrfs_stop_workers(&fs_info->endio_meta_workers); 3426 btrfs_stop_workers(&fs_info->endio_meta_workers);
3427 btrfs_stop_workers(&fs_info->endio_raid56_workers);
3428 btrfs_stop_workers(&fs_info->rmw_workers);
3387 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 3429 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
3388 btrfs_stop_workers(&fs_info->endio_write_workers); 3430 btrfs_stop_workers(&fs_info->endio_write_workers);
3389 btrfs_stop_workers(&fs_info->endio_freespace_worker); 3431 btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
3404 bdi_destroy(&fs_info->bdi); 3446 bdi_destroy(&fs_info->bdi);
3405 cleanup_srcu_struct(&fs_info->subvol_srcu); 3447 cleanup_srcu_struct(&fs_info->subvol_srcu);
3406 3448
3449 btrfs_free_stripe_hash_table(fs_info);
3450
3407 return 0; 3451 return 0;
3408} 3452}
3409 3453