aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent-tree.c
diff options
context:
space:
mode:
author David Woodhouse <David.Woodhouse@intel.com> 2013-01-29 18:40:14 -0500
committer Chris Mason <chris.mason@fusionio.com> 2013-02-01 14:24:23 -0500
commit 53b381b3abeb86f12787a6c40fee9b2f71edc23b (patch)
tree c1018ba2157778f0200d2ede0c0df48fe5df8f14 /fs/btrfs/extent-tree.c
parent 64a167011bcabc1e855658387c8a4464b71f3138 (diff)
Btrfs: RAID5 and RAID6
This builds on David Woodhouse's original Btrfs raid5/6 implementation. The code has changed quite a bit, blame Chris Mason for any bugs. Read/modify/write is done after the higher levels of the filesystem have prepared a given bio. This means the higher layers are not responsible for building full stripes, and they don't need to query for the topology of the extents that may get allocated during delayed allocation runs. It also means different files can easily share the same stripe. But, it does expose us to incorrect parity if we crash or lose power while doing a read/modify/write cycle. This will be addressed in a later commit. Scrub is unable to repair crc errors on raid5/6 chunks. Discard does not work on raid5/6 (yet). The stripe size is fixed at 64KiB per disk. This will be tunable in a later commit. Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r-- fs/btrfs/extent-tree.c 88
1 files changed, 59 insertions, 29 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d133edfcd449..3345f68fc64b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -3276,6 +3279,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3279 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3280 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3281 u64 target;
3282 u64 tmp;
3279 3283
3280 /* 3284 /*
3281 * see if restripe for this chunk_type is in progress, if so 3285 * see if restripe for this chunk_type is in progress, if so
@@ -3292,30 +3296,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3296 }
3293 spin_unlock(&root->fs_info->balance_lock); 3297 spin_unlock(&root->fs_info->balance_lock);
3294 3298
3299 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3300 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3301 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3302 BTRFS_BLOCK_GROUP_RAID5);
3303 if (num_devices < 3)
3304 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3305 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3306 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3307
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3308 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3309 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3310 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3311 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3312
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3313 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3314 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3315 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3316 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3317 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3318 tmp = BTRFS_BLOCK_GROUP_RAID10;
3319 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3320 tmp = BTRFS_BLOCK_GROUP_RAID1;
3321 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3322 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3323
3318 return extended_to_chunk(flags); 3324 return extended_to_chunk(flags | tmp);
3319} 3325}
3320 3326
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3327static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3333,6 +3339,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3339u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3340{
3335 u64 flags; 3341 u64 flags;
3342 u64 ret;
3336 3343
3337 if (data) 3344 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3345 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3348,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3348 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3349 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3350
3344 return get_alloc_profile(root, flags); 3351 ret = get_alloc_profile(root, flags);
3352 return ret;
3345} 3353}
3346 3354
3347/* 3355/*
@@ -3516,8 +3524,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3524{
3517 u64 num_dev; 3525 u64 num_dev;
3518 3526
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3527 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3528 BTRFS_BLOCK_GROUP_RAID0 |
3529 BTRFS_BLOCK_GROUP_RAID5 |
3530 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3531 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3532 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3533 num_dev = 2;
@@ -3667,7 +3677,9 @@ static int can_overcommit(struct btrfs_root *root,
3667 3677
3668 /* 3678 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3679 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3680 * space is actually useable. For raid56, the space info used
3681 * doesn't include the parity drive, so we don't have to
3682 * change the math
3671 */ 3683 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3684 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3685 BTRFS_BLOCK_GROUP_RAID1 |
@@ -5455,10 +5467,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5455 return ret; 5467 return ret;
5456} 5468}
5457 5469
5458static u64 stripe_align(struct btrfs_root *root, u64 val) 5470static u64 stripe_align(struct btrfs_root *root,
5471 struct btrfs_block_group_cache *cache,
5472 u64 val, u64 num_bytes)
5459{ 5473{
5460 u64 mask = ((u64)root->stripesize - 1); 5474 u64 mask;
5461 u64 ret = (val + mask) & ~mask; 5475 u64 ret;
5476 mask = ((u64)root->stripesize - 1);
5477 ret = (val + mask) & ~mask;
5462 return ret; 5478 return ret;
5463} 5479}
5464 5480
@@ -5519,9 +5535,12 @@ int __get_raid_index(u64 flags)
5519 index = 2; 5535 index = 2;
5520 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5536 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5521 index = 3; 5537 index = 3;
5538 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5539 index = 5;
5540 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5541 index = 6;
5522 else 5542 else
5523 index = 4; 5543 index = 4; /* BTRFS_BLOCK_GROUP_SINGLE */
5524
5525 return index; 5544 return index;
5526} 5545}
5527 5546
@@ -5665,6 +5684,8 @@ search:
5665 if (!block_group_bits(block_group, data)) { 5684 if (!block_group_bits(block_group, data)) {
5666 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5685 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5667 BTRFS_BLOCK_GROUP_RAID1 | 5686 BTRFS_BLOCK_GROUP_RAID1 |
5687 BTRFS_BLOCK_GROUP_RAID5 |
5688 BTRFS_BLOCK_GROUP_RAID6 |
5668 BTRFS_BLOCK_GROUP_RAID10; 5689 BTRFS_BLOCK_GROUP_RAID10;
5669 5690
5670 /* 5691 /*
@@ -5835,7 +5856,8 @@ unclustered_alloc:
5835 goto loop; 5856 goto loop;
5836 } 5857 }
5837checks: 5858checks:
5838 search_start = stripe_align(root, offset); 5859 search_start = stripe_align(root, used_block_group,
5860 offset, num_bytes);
5839 5861
5840 /* move on to the next group */ 5862 /* move on to the next group */
5841 if (search_start + num_bytes > 5863 if (search_start + num_bytes >
@@ -7203,6 +7225,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7203 root->fs_info->fs_devices->missing_devices; 7225 root->fs_info->fs_devices->missing_devices;
7204 7226
7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7227 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7228 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7206 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7229 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7207 7230
7208 if (num_devices == 1) { 7231 if (num_devices == 1) {
@@ -7754,7 +7777,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7754 btrfs_release_path(path); 7777 btrfs_release_path(path);
7755 cache->flags = btrfs_block_group_flags(&cache->item); 7778 cache->flags = btrfs_block_group_flags(&cache->item);
7756 cache->sectorsize = root->sectorsize; 7779 cache->sectorsize = root->sectorsize;
7757 7780 cache->full_stripe_len = btrfs_full_stripe_len(root,
7781 &root->fs_info->mapping_tree,
7782 found_key.objectid);
7758 btrfs_init_free_space_ctl(cache); 7783 btrfs_init_free_space_ctl(cache);
7759 7784
7760 /* 7785 /*
@@ -7808,6 +7833,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7808 if (!(get_alloc_profile(root, space_info->flags) & 7833 if (!(get_alloc_profile(root, space_info->flags) &
7809 (BTRFS_BLOCK_GROUP_RAID10 | 7834 (BTRFS_BLOCK_GROUP_RAID10 |
7810 BTRFS_BLOCK_GROUP_RAID1 | 7835 BTRFS_BLOCK_GROUP_RAID1 |
7836 BTRFS_BLOCK_GROUP_RAID5 |
7837 BTRFS_BLOCK_GROUP_RAID6 |
7811 BTRFS_BLOCK_GROUP_DUP))) 7838 BTRFS_BLOCK_GROUP_DUP)))
7812 continue; 7839 continue;
7813 /* 7840 /*
@@ -7883,6 +7910,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7910 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7884 cache->sectorsize = root->sectorsize; 7911 cache->sectorsize = root->sectorsize;
7885 cache->fs_info = root->fs_info; 7912 cache->fs_info = root->fs_info;
7913 cache->full_stripe_len = btrfs_full_stripe_len(root,
7914 &root->fs_info->mapping_tree,
7915 chunk_offset);
7886 7916
7887 atomic_set(&cache->count, 1); 7917 atomic_set(&cache->count, 1);
7888 spin_lock_init(&cache->lock); 7918 spin_lock_init(&cache->lock);