Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  590
1 files changed, 391 insertions(+), 199 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5a3327b8f90d..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -72,8 +73,7 @@ enum {
72 RESERVE_ALLOC_NO_ACCOUNT = 2, 73 RESERVE_ALLOC_NO_ACCOUNT = 2,
73}; 74};
74 75
75static int update_block_group(struct btrfs_trans_handle *trans, 76static int update_block_group(struct btrfs_root *root,
76 struct btrfs_root *root,
77 u64 bytenr, u64 num_bytes, int alloc); 77 u64 bytenr, u64 num_bytes, int alloc);
78static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 78static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, 79 struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 int dump_block_groups); 103 int dump_block_groups);
104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 u64 num_bytes, int reserve); 105 u64 num_bytes, int reserve);
106static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 u64 num_bytes);
106 108
107static noinline int 109static noinline int
108block_group_cache_done(struct btrfs_block_group_cache *cache) 110block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
162 rb_link_node(&block_group->cache_node, parent, p); 164 rb_link_node(&block_group->cache_node, parent, p);
163 rb_insert_color(&block_group->cache_node, 165 rb_insert_color(&block_group->cache_node,
164 &info->block_group_cache_tree); 166 &info->block_group_cache_tree);
167
168 if (info->first_logical_byte > block_group->key.objectid)
169 info->first_logical_byte = block_group->key.objectid;
170
165 spin_unlock(&info->block_group_cache_lock); 171 spin_unlock(&info->block_group_cache_lock);
166 172
167 return 0; 173 return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
203 break; 209 break;
204 } 210 }
205 } 211 }
206 if (ret) 212 if (ret) {
207 btrfs_get_block_group(ret); 213 btrfs_get_block_group(ret);
214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215 info->first_logical_byte = ret->key.objectid;
216 }
208 spin_unlock(&info->block_group_cache_lock); 217 spin_unlock(&info->block_group_cache_lock);
209 218
210 return ret; 219 return ret;
@@ -468,8 +477,6 @@ out:
468} 477}
469 478
470static int cache_block_group(struct btrfs_block_group_cache *cache, 479static int cache_block_group(struct btrfs_block_group_cache *cache,
471 struct btrfs_trans_handle *trans,
472 struct btrfs_root *root,
473 int load_cache_only) 480 int load_cache_only)
474{ 481{
475 DEFINE_WAIT(wait); 482 DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
527 cache->cached = BTRFS_CACHE_FAST; 534 cache->cached = BTRFS_CACHE_FAST;
528 spin_unlock(&cache->lock); 535 spin_unlock(&cache->lock);
529 536
530 /*
531 * We can't do the read from on-disk cache during a commit since we need
532 * to have the normal tree locking. Also if we are currently trying to
533 * allocate blocks for the tree root we can't do the fast caching since
534 * we likely hold important locks.
535 */
536 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 537 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537 ret = load_free_space_cache(fs_info, cache); 538 ret = load_free_space_cache(fs_info, cache);
538 539
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2143 node->num_bytes); 2146 node->num_bytes);
2144 } 2147 }
2145 } 2148 }
2146 mutex_unlock(&head->mutex);
2147 return ret; 2149 return ret;
2148 } 2150 }
2149 2151
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2258 * process of being added. Don't run this ref yet. 2260 * process of being added. Don't run this ref yet.
2259 */ 2261 */
2260 list_del_init(&locked_ref->cluster); 2262 list_del_init(&locked_ref->cluster);
2261 mutex_unlock(&locked_ref->mutex); 2263 btrfs_delayed_ref_unlock(locked_ref);
2262 locked_ref = NULL; 2264 locked_ref = NULL;
2263 delayed_refs->num_heads_ready++; 2265 delayed_refs->num_heads_ready++;
2264 spin_unlock(&delayed_refs->lock); 2266 spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2285 ref = &locked_ref->node; 2287 ref = &locked_ref->node;
2286 2288
2287 if (extent_op && must_insert_reserved) { 2289 if (extent_op && must_insert_reserved) {
2288 kfree(extent_op); 2290 btrfs_free_delayed_extent_op(extent_op);
2289 extent_op = NULL; 2291 extent_op = NULL;
2290 } 2292 }
2291 2293
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2294 2296
2295 ret = run_delayed_extent_op(trans, root, 2297 ret = run_delayed_extent_op(trans, root,
2296 ref, extent_op); 2298 ref, extent_op);
2297 kfree(extent_op); 2299 btrfs_free_delayed_extent_op(extent_op);
2298 2300
2299 if (ret) { 2301 if (ret) {
2300 list_del_init(&locked_ref->cluster); 2302 printk(KERN_DEBUG
2301 mutex_unlock(&locked_ref->mutex); 2303 "btrfs: run_delayed_extent_op "
2302 2304 "returned %d\n", ret);
2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2304 spin_lock(&delayed_refs->lock); 2305 spin_lock(&delayed_refs->lock);
2306 btrfs_delayed_ref_unlock(locked_ref);
2305 return ret; 2307 return ret;
2306 } 2308 }
2307 2309
2308 goto next; 2310 goto next;
2309 } 2311 }
2310
2311 list_del_init(&locked_ref->cluster);
2312 locked_ref = NULL;
2313 } 2312 }
2314 2313
2315 ref->in_tree = 0; 2314 ref->in_tree = 0;
2316 rb_erase(&ref->rb_node, &delayed_refs->root); 2315 rb_erase(&ref->rb_node, &delayed_refs->root);
2317 delayed_refs->num_entries--; 2316 delayed_refs->num_entries--;
2318 if (locked_ref) { 2317 if (!btrfs_delayed_ref_is_head(ref)) {
2319 /* 2318 /*
2320 * when we play the delayed ref, also correct the 2319 * when we play the delayed ref, also correct the
2321 * ref_mod on head 2320 * ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2336 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2338 must_insert_reserved); 2337 must_insert_reserved);
2339 2338
2340 btrfs_put_delayed_ref(ref); 2339 btrfs_free_delayed_extent_op(extent_op);
2341 kfree(extent_op);
2342 count++;
2343
2344 if (ret) { 2340 if (ret) {
2345 if (locked_ref) { 2341 btrfs_delayed_ref_unlock(locked_ref);
2346 list_del_init(&locked_ref->cluster); 2342 btrfs_put_delayed_ref(ref);
2347 mutex_unlock(&locked_ref->mutex); 2343 printk(KERN_DEBUG
2348 } 2344 "btrfs: run_one_delayed_ref returned %d\n", ret);
2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2350 spin_lock(&delayed_refs->lock); 2345 spin_lock(&delayed_refs->lock);
2351 return ret; 2346 return ret;
2352 } 2347 }
2353 2348
2349 /*
2350 * If this node is a head, that means all the refs in this head
2351 * have been dealt with, and we will pick the next head to deal
2352 * with, so we must unlock the head and drop it from the cluster
2353 * list before we release it.
2354 */
2355 if (btrfs_delayed_ref_is_head(ref)) {
2356 list_del_init(&locked_ref->cluster);
2357 btrfs_delayed_ref_unlock(locked_ref);
2358 locked_ref = NULL;
2359 }
2360 btrfs_put_delayed_ref(ref);
2361 count++;
2354next: 2362next:
2355 cond_resched(); 2363 cond_resched();
2356 spin_lock(&delayed_refs->lock); 2364 spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2435 return ret; 2443 return ret;
2436} 2444}
2437 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2438/* 2456/*
2439 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2440 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2469 2487
2470 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2471 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2472again: 2528again:
2473 loops = 0; 2529 loops = 0;
2474 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
@@ -2477,10 +2533,6 @@ again:
2477 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2478#endif 2534#endif
2479 2535
2480 if (count == 0) {
2481 count = delayed_refs->num_entries * 2;
2482 run_most = 1;
2483 }
2484 while (1) { 2536 while (1) {
2485 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2486 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
2500 2552
2501 ret = run_clustered_refs(trans, root, &cluster); 2553 ret = run_clustered_refs(trans, root, &cluster);
2502 if (ret < 0) { 2554 if (ret < 0) {
2555 btrfs_release_ref_cluster(&cluster);
2503 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2504 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2505 return ret; 2559 return ret;
2506 } 2560 }
2507 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2508 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2509 2565
2510 if (count == 0) 2566 if (count == 0)
@@ -2573,6 +2629,11 @@ again:
2573 goto again; 2629 goto again;
2574 } 2630 }
2575out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2576 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2577 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2578 return 0; 2639 return 0;
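
These two hunks throttle delayed-ref processing: atomic_cmpxchg() on procs_running_refs admits a single runner, other callers either bail out while the backlog is small or sleep on delayed_refs->wait until ref_seq shows enough progress, and the out: path drops the count and wakes the sleepers. A minimal sketch of the same single-runner gate, using hypothetical names (my_refs, run_refs_locked) rather than the structures from this patch:

        /* Sketch only: one task processes refs, others wait or back off. */
        struct my_refs {
                atomic_t runner;           /* 0 = idle, 1 = a task is running refs */
                atomic_t seq;              /* bumped for every processed ref */
                wait_queue_head_t wait;    /* init_waitqueue_head() at setup time */
                unsigned long num_entries; /* current backlog */
        };

        static void run_refs_locked(struct my_refs *r);   /* hypothetical worker */

        static void run_refs_throttled(struct my_refs *r, unsigned long small_backlog)
        {
                int seq = atomic_read(&r->seq);

                while (atomic_cmpxchg(&r->runner, 0, 1)) {
                        DEFINE_WAIT(__wait);

                        if (r->num_entries < small_backlog)
                                return;             /* let the current runner catch up */
                        prepare_to_wait(&r->wait, &__wait, TASK_UNINTERRUPTIBLE);
                        if (atomic_cmpxchg(&r->runner, 0, 1)) {
                                schedule();
                                finish_wait(&r->wait, &__wait);
                                if (atomic_read(&r->seq) != seq)
                                        return;     /* someone else made progress for us */
                                continue;           /* still stuck, try to become the runner */
                        }
                        finish_wait(&r->wait, &__wait);
                        break;                      /* we won the race after all */
                }

                run_refs_locked(r);

                atomic_set(&r->runner, 0);
                smp_mb();                           /* order the release before the waitqueue check */
                if (waitqueue_active(&r->wait))
                        wake_up(&r->wait);
        }
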
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586 struct btrfs_delayed_extent_op *extent_op; 2647 struct btrfs_delayed_extent_op *extent_op;
2587 int ret; 2648 int ret;
2588 2649
2589 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2650 extent_op = btrfs_alloc_delayed_extent_op();
2590 if (!extent_op) 2651 if (!extent_op)
2591 return -ENOMEM; 2652 return -ENOMEM;
2592 2653
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2659 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2599 num_bytes, extent_op); 2660 num_bytes, extent_op);
2600 if (ret) 2661 if (ret)
2601 kfree(extent_op); 2662 btrfs_free_delayed_extent_op(extent_op);
2602 return ret; 2663 return ret;
2603} 2664}
2604 2665
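
Throughout the patch, kmalloc()/kfree() of struct btrfs_delayed_extent_op is replaced by btrfs_alloc_delayed_extent_op()/btrfs_free_delayed_extent_op(). Those helpers live in the delayed-ref code rather than in this file; a plausible sketch of their shape, assuming a dedicated slab cache created during module init:

        static struct kmem_cache *delayed_extent_op_cache;  /* kmem_cache_create() at module init */

        struct btrfs_delayed_extent_op *btrfs_alloc_delayed_extent_op(void)
        {
                return kmem_cache_alloc(delayed_extent_op_cache, GFP_NOFS);
        }

        void btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
        {
                if (op)
                        kmem_cache_free(delayed_extent_op_cache, op);
        }

A per-object kmem_cache packs these small, frequently allocated objects better than generic kmalloc and makes leaks visible under slab accounting.
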
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3223 u64 extra_flags = chunk_to_extended(flags) & 3284 u64 extra_flags = chunk_to_extended(flags) &
3224 BTRFS_EXTENDED_PROFILE_MASK; 3285 BTRFS_EXTENDED_PROFILE_MASK;
3225 3286
3287 write_seqlock(&fs_info->profiles_lock);
3226 if (flags & BTRFS_BLOCK_GROUP_DATA) 3288 if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 fs_info->avail_data_alloc_bits |= extra_flags; 3289 fs_info->avail_data_alloc_bits |= extra_flags;
3228 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3290 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3229 fs_info->avail_metadata_alloc_bits |= extra_flags; 3291 fs_info->avail_metadata_alloc_bits |= extra_flags;
3230 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3292 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3231 fs_info->avail_system_alloc_bits |= extra_flags; 3293 fs_info->avail_system_alloc_bits |= extra_flags;
3294 write_sequnlock(&fs_info->profiles_lock);
3232} 3295}
3233 3296
3234/* 3297/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3341 u64 target;
3342 u64 tmp;
3279 3343
3280 /* 3344 /*
3281 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3356 }
3293 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3294 3358
3359 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3360 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3365 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3367
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3372
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3383
3318 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3319} 3385}
3320 3386
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3322{ 3388{
3323 if (flags & BTRFS_BLOCK_GROUP_DATA) 3389 unsigned seq;
3324 flags |= root->fs_info->avail_data_alloc_bits; 3390
3325 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3391 do {
3326 flags |= root->fs_info->avail_system_alloc_bits; 3392 seq = read_seqbegin(&root->fs_info->profiles_lock);
3327 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3393
3328 flags |= root->fs_info->avail_metadata_alloc_bits; 3394 if (flags & BTRFS_BLOCK_GROUP_DATA)
3395 flags |= root->fs_info->avail_data_alloc_bits;
3396 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3397 flags |= root->fs_info->avail_system_alloc_bits;
3398 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3399 flags |= root->fs_info->avail_metadata_alloc_bits;
3400 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3329 3401
3330 return btrfs_reduce_alloc_profile(root, flags); 3402 return btrfs_reduce_alloc_profile(root, flags);
3331} 3403}
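
The avail_*_alloc_bits words are now published under fs_info->profiles_lock: set_avail_alloc_bits() (and clear_avail_alloc_bits() further down) write under write_seqlock(), while get_alloc_profile() rereads until read_seqretry() reports a stable snapshot. Reduced to a generic sketch with a hypothetical shared word:

        static DEFINE_SEQLOCK(profile_lock);
        static u64 avail_bits;

        static void publish_bits(u64 extra)         /* writer: rare, short critical section */
        {
                write_seqlock(&profile_lock);
                avail_bits |= extra;
                write_sequnlock(&profile_lock);
        }

        static u64 snapshot_bits(void)              /* reader: lockless, retries a torn read */
        {
                unsigned int seq;
                u64 val;

                do {
                        seq = read_seqbegin(&profile_lock);
                        val = avail_bits;
                } while (read_seqretry(&profile_lock, seq));

                return val;
        }

A seqlock fits here because profile changes are rare and readers only need a consistent 64-bit snapshot, which an unlocked read could tear on 32-bit hosts.
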
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3406{
3335 u64 flags; 3407 u64 flags;
3408 u64 ret;
3336 3409
3337 if (data) 3410 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3414 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3416
3344 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3345} 3419}
3346 3420
3347/* 3421/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3357 int ret = 0, committed = 0, alloc_chunk = 1; 3431 int ret = 0, committed = 0, alloc_chunk = 1;
3358 3432
3359 /* make sure bytes are sectorsize aligned */ 3433 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3434 bytes = ALIGN(bytes, root->sectorsize);
3361 3435
3362 if (root == root->fs_info->tree_root || 3436 if (root == root->fs_info->tree_root ||
3363 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3437 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
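
The open-coded round-up is replaced with ALIGN(); for a power-of-two sectorsize the two expressions are equivalent. A worked example with an assumed 4096-byte sectorsize:

        bytes = 10000
        (10000 + 4095) & ~4095  = 14095 & ~4095 = 12288
        ALIGN(10000, 4096)                      = 12288   /* three full 4 KiB sectors */
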
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3452 struct btrfs_space_info *data_sinfo; 3526 struct btrfs_space_info *data_sinfo;
3453 3527
3454 /* make sure bytes are sectorsize aligned */ 3528 /* make sure bytes are sectorsize aligned */
3455 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3529 bytes = ALIGN(bytes, root->sectorsize);
3456 3530
3457 data_sinfo = root->fs_info->data_sinfo; 3531 data_sinfo = root->fs_info->data_sinfo;
3458 spin_lock(&data_sinfo->lock); 3532 spin_lock(&data_sinfo->lock);
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3590{
3517 u64 num_dev; 3591 u64 num_dev;
3518 3592
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3599 num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3564 int wait_for_alloc = 0; 3640 int wait_for_alloc = 0;
3565 int ret = 0; 3641 int ret = 0;
3566 3642
3643 /* Don't re-enter if we're already allocating a chunk */
3644 if (trans->allocating_chunk)
3645 return -ENOSPC;
3646
3567 space_info = __find_space_info(extent_root->fs_info, flags); 3647 space_info = __find_space_info(extent_root->fs_info, flags);
3568 if (!space_info) { 3648 if (!space_info) {
3569 ret = update_space_info(extent_root->fs_info, flags, 3649 ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
3606 goto again; 3686 goto again;
3607 } 3687 }
3608 3688
3689 trans->allocating_chunk = true;
3690
3609 /* 3691 /*
3610 * If we have mixed data/metadata chunks we want to make sure we keep 3692 * If we have mixed data/metadata chunks we want to make sure we keep
3611 * allocating mixed chunks instead of individual chunks. 3693 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
3632 check_system_chunk(trans, extent_root, flags); 3714 check_system_chunk(trans, extent_root, flags);
3633 3715
3634 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3716 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3635 if (ret < 0 && ret != -ENOSPC) 3717 trans->allocating_chunk = false;
3636 goto out;
3637 3718
3638 spin_lock(&space_info->lock); 3719 spin_lock(&space_info->lock);
3720 if (ret < 0 && ret != -ENOSPC)
3721 goto out;
3639 if (ret) 3722 if (ret)
3640 space_info->full = 1; 3723 space_info->full = 1;
3641 else 3724 else
3642 ret = 1; 3725 ret = 1;
3643 3726
3644 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3727 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3728out:
3645 space_info->chunk_alloc = 0; 3729 space_info->chunk_alloc = 0;
3646 spin_unlock(&space_info->lock); 3730 spin_unlock(&space_info->lock);
3647out:
3648 mutex_unlock(&fs_info->chunk_mutex); 3731 mutex_unlock(&fs_info->chunk_mutex);
3649 return ret; 3732 return ret;
3650} 3733}
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes, 3736 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush) 3737 enum btrfs_reserve_flush_enum flush)
3655{ 3738{
3739 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3656 u64 profile = btrfs_get_alloc_profile(root, 0); 3740 u64 profile = btrfs_get_alloc_profile(root, 0);
3741 u64 rsv_size = 0;
3657 u64 avail; 3742 u64 avail;
3658 u64 used; 3743 u64 used;
3744 u64 to_add;
3659 3745
3660 used = space_info->bytes_used + space_info->bytes_reserved + 3746 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly + 3747 space_info->bytes_pinned + space_info->bytes_readonly;
3662 space_info->bytes_may_use; 3748
3749 spin_lock(&global_rsv->lock);
3750 rsv_size = global_rsv->size;
3751 spin_unlock(&global_rsv->lock);
3752
3753 /*
3754 * We only want to allow over committing if we have lots of actual space
3755 * free, but if we don't have enough space to handle the global reserve
3756 * space then we could end up having a real enospc problem when trying
3757 * to allocate a chunk or some other such important allocation.
3758 */
3759 rsv_size <<= 1;
3760 if (used + rsv_size >= space_info->total_bytes)
3761 return 0;
3762
3763 used += space_info->bytes_may_use;
3663 3764
3664 spin_lock(&root->fs_info->free_chunk_lock); 3765 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space; 3766 avail = root->fs_info->free_chunk_space;
@@ -3667,40 +3768,58 @@ static int can_overcommit(struct btrfs_root *root,
3667 3768
3668 /* 3769 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3671 */ 3774 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10)) 3777 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1; 3778 avail >>= 1;
3676 3779
3780 to_add = space_info->total_bytes;
3781
3677 /* 3782 /*
3678 * If we aren't flushing all things, let us overcommit up to 3783 * If we aren't flushing all things, let us overcommit up to
3679 * 1/2th of the space. If we can flush, don't let us overcommit 3784 * 1/2th of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space. 3785 * too much, let it overcommit up to 1/8 of the space.
3681 */ 3786 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3787 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3; 3788 to_add >>= 3;
3684 else 3789 else
3685 avail >>= 1; 3790 to_add >>= 1;
3791
3792 /*
3793 * Limit the overcommit to the amount of free space we could possibly
3794 * allocate for chunks.
3795 */
3796 to_add = min(avail, to_add);
3686 3797
3687 if (used + bytes < space_info->total_bytes + avail) 3798 if (used + bytes < space_info->total_bytes + to_add)
3688 return 1; 3799 return 1;
3689 return 0; 3800 return 0;
3690} 3801}
3691 3802
3692static int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb, 3803void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3693 unsigned long nr_pages, 3804 unsigned long nr_pages)
3694 enum wb_reason reason)
3695{ 3805{
3696 if (!writeback_in_progress(sb->s_bdi) && 3806 struct super_block *sb = root->fs_info->sb;
3697 down_read_trylock(&sb->s_umount)) { 3807 int started;
3698 writeback_inodes_sb_nr(sb, nr_pages, reason);
3699 up_read(&sb->s_umount);
3700 return 1;
3701 }
3702 3808
3703 return 0; 3809 /* If we can not start writeback, just sync all the delalloc file. */
3810 started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3811 WB_REASON_FS_FREE_SPACE);
3812 if (!started) {
3813 /*
3814 * We needn't worry the filesystem going from r/w to r/o though
3815 * we don't acquire ->s_umount mutex, because the filesystem
3816 * should guarantee the delalloc inodes list be empty after
3817 * the filesystem is readonly(all dirty pages are written to
3818 * the disk).
3819 */
3820 btrfs_start_delalloc_inodes(root, 0);
3821 btrfs_wait_ordered_extents(root, 0);
3822 }
3704} 3823}
3705 3824
3706/* 3825/*
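
The reworked can_overcommit() no longer counts bytes_may_use against the hard ceiling, refuses to overcommit once used space plus twice the global reserve reaches total_bytes, and caps the allowance at min(usable unallocated space, total/8 or total/2 depending on the flush mode). A standalone sketch of that arithmetic on plain integers (hypothetical parameter names, no locking):

        #include <stdbool.h>
        #include <stdint.h>

        #define FLUSH_ALL 1

        static bool can_overcommit(uint64_t total, uint64_t used_hard, uint64_t may_use,
                                   uint64_t global_rsv, uint64_t free_chunk,
                                   bool mirrored, int flush, uint64_t bytes)
        {
                /* DUP/RAID1/RAID10 write two copies, so only half the raw space is usable */
                uint64_t avail = mirrored ? free_chunk / 2 : free_chunk;
                uint64_t to_add;

                /* keep enough headroom for the global reserve itself */
                if (used_hard + 2 * global_rsv >= total)
                        return false;

                to_add = (flush == FLUSH_ALL) ? total / 8 : total / 2;
                if (avail < to_add)
                        to_add = avail;     /* never promise more than chunk allocation could deliver */

                return used_hard + may_use + bytes < total + to_add;
        }
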
@@ -3724,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3724 space_info = block_rsv->space_info; 3843 space_info = block_rsv->space_info;
3725 3844
3726 smp_mb(); 3845 smp_mb();
3727 delalloc_bytes = root->fs_info->delalloc_bytes; 3846 delalloc_bytes = percpu_counter_sum_positive(
3847 &root->fs_info->delalloc_bytes);
3728 if (delalloc_bytes == 0) { 3848 if (delalloc_bytes == 0) {
3729 if (trans) 3849 if (trans)
3730 return; 3850 return;
@@ -3735,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3735 while (delalloc_bytes && loops < 3) { 3855 while (delalloc_bytes && loops < 3) {
3736 max_reclaim = min(delalloc_bytes, to_reclaim); 3856 max_reclaim = min(delalloc_bytes, to_reclaim);
3737 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3857 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3738 writeback_inodes_sb_nr_if_idle_safe(root->fs_info->sb, 3858 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3739 nr_pages,
3740 WB_REASON_FS_FREE_SPACE);
3741
3742 /* 3859 /*
3743 * We need to wait for the async pages to actually start before 3860 * We need to wait for the async pages to actually start before
3744 * we do anything. 3861 * we do anything.
@@ -3766,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3766 break; 3883 break;
3767 } 3884 }
3768 smp_mb(); 3885 smp_mb();
3769 delalloc_bytes = root->fs_info->delalloc_bytes; 3886 delalloc_bytes = percpu_counter_sum_positive(
3887 &root->fs_info->delalloc_bytes);
3770 } 3888 }
3771} 3889}
3772 3890
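
fs_info->delalloc_bytes becomes a struct percpu_counter elsewhere in this series, so shrink_delalloc() now folds the per-CPU deltas with percpu_counter_sum_positive() instead of reading a plain u64 behind smp_mb(). A sketch of the counter's life cycle; the real init/teardown lives in disk-io.c, not in this file, and on this era of kernels percpu_counter_init() takes no gfp_t argument:

        #include <linux/percpu_counter.h>

        static struct percpu_counter delalloc_bytes;

        static int delalloc_counter_init(void)
        {
                return percpu_counter_init(&delalloc_bytes, 0);   /* newer kernels add a gfp_t here */
        }

        static void delalloc_counter_add(s64 delta)
        {
                percpu_counter_add(&delalloc_bytes, delta);       /* cheap, per-CPU, no shared lock */
        }

        static u64 delalloc_counter_total(void)
        {
                /* folds every CPU's delta; clamps transient negatives to 0 */
                return percpu_counter_sum_positive(&delalloc_bytes);
        }

        static void delalloc_counter_exit(void)
        {
                percpu_counter_destroy(&delalloc_bytes);
        }
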
@@ -4030,6 +4148,15 @@ again:
4030 goto again; 4148 goto again;
4031 4149
4032out: 4150out:
4151 if (ret == -ENOSPC &&
4152 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4153 struct btrfs_block_rsv *global_rsv =
4154 &root->fs_info->global_block_rsv;
4155
4156 if (block_rsv != global_rsv &&
4157 !block_rsv_use_bytes(global_rsv, orig_bytes))
4158 ret = 0;
4159 }
4033 if (flushing) { 4160 if (flushing) {
4034 spin_lock(&space_info->lock); 4161 spin_lock(&space_info->lock);
4035 space_info->flush = 0; 4162 space_info->flush = 0;
@@ -4416,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4416 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4543 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4417} 4544}
4418 4545
4419int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4546/*
4420 struct btrfs_pending_snapshot *pending) 4547 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4548 * root: the root of the parent directory
4549 * rsv: block reservation
4550 * items: the number of items that we need do reservation
4551 * qgroup_reserved: used to return the reserved size in qgroup
4552 *
4553 * This function is used to reserve the space for snapshot/subvolume
4554 * creation and deletion. Those operations are different with the
4555 * common file/directory operations, they change two fs/file trees
4556 * and root tree, the number of items that the qgroup reserves is
4557 * different with the free space reservation. So we can not use
4558 * the space reseravtion mechanism in start_transaction().
4559 */
4560int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4561 struct btrfs_block_rsv *rsv,
4562 int items,
4563 u64 *qgroup_reserved)
4421{ 4564{
4422 struct btrfs_root *root = pending->root; 4565 u64 num_bytes;
4423 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4566 int ret;
4424 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4567
4425 /* 4568 if (root->fs_info->quota_enabled) {
4426 * two for root back/forward refs, two for directory entries, 4569 /* One for parent inode, two for dir entries */
4427 * one for root of the snapshot and one for parent inode. 4570 num_bytes = 3 * root->leafsize;
4428 */ 4571 ret = btrfs_qgroup_reserve(root, num_bytes);
4429 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); 4572 if (ret)
4430 dst_rsv->space_info = src_rsv->space_info; 4573 return ret;
4431 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4574 } else {
4575 num_bytes = 0;
4576 }
4577
4578 *qgroup_reserved = num_bytes;
4579
4580 num_bytes = btrfs_calc_trans_metadata_size(root, items);
4581 rsv->space_info = __find_space_info(root->fs_info,
4582 BTRFS_BLOCK_GROUP_METADATA);
4583 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4584 BTRFS_RESERVE_FLUSH_ALL);
4585 if (ret) {
4586 if (*qgroup_reserved)
4587 btrfs_qgroup_free(root, *qgroup_reserved);
4588 }
4589
4590 return ret;
4591}
4592
4593void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4594 struct btrfs_block_rsv *rsv,
4595 u64 qgroup_reserved)
4596{
4597 btrfs_block_rsv_release(root, rsv, (u64)-1);
4598 if (qgroup_reserved)
4599 btrfs_qgroup_free(root, qgroup_reserved);
4432} 4600}
4433 4601
4434/** 4602/**
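
btrfs_snap_reserve_metadata() is generalized into the btrfs_subvolume_reserve_metadata()/btrfs_subvolume_release_metadata() pair, which reserves `items` tree items plus, when quotas are enabled, three leaves of qgroup space (parent inode and two directory entries). A hedged sketch of how a caller such as subvolume or snapshot creation might use the pair, inside the creation path; the item count and the block_rsv setup are illustrative, not taken from this patch:

        struct btrfs_block_rsv rsv;
        u64 qgroup_reserved = 0;
        int ret;

        btrfs_init_block_rsv(&rsv, BTRFS_BLOCK_RSV_TEMP);

        /* e.g. root back/forward refs, directory entries, new root item, parent inode */
        ret = btrfs_subvolume_reserve_metadata(root, &rsv, 8, &qgroup_reserved);
        if (ret)
                return ret;

        /* ... create the subvolume/snapshot items, charging metadata to &rsv ... */

        btrfs_subvolume_release_metadata(root, &rsv, qgroup_reserved);
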
@@ -4536,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4536 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4704 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4537 int ret = 0; 4705 int ret = 0;
4538 bool delalloc_lock = true; 4706 bool delalloc_lock = true;
4707 u64 to_free = 0;
4708 unsigned dropped;
4539 4709
4540 /* If we are a free space inode we need to not flush since we will be in 4710 /* If we are a free space inode we need to not flush since we will be in
4541 * the middle of a transaction commit. We also don't need the delalloc 4711 * the middle of a transaction commit. We also don't need the delalloc
@@ -4579,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4579 csum_bytes = BTRFS_I(inode)->csum_bytes; 4749 csum_bytes = BTRFS_I(inode)->csum_bytes;
4580 spin_unlock(&BTRFS_I(inode)->lock); 4750 spin_unlock(&BTRFS_I(inode)->lock);
4581 4751
4582 if (root->fs_info->quota_enabled) 4752 if (root->fs_info->quota_enabled) {
4583 ret = btrfs_qgroup_reserve(root, num_bytes + 4753 ret = btrfs_qgroup_reserve(root, num_bytes +
4584 nr_extents * root->leafsize); 4754 nr_extents * root->leafsize);
4755 if (ret)
4756 goto out_fail;
4757 }
4585 4758
4586 /* 4759 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4587 * ret != 0 here means the qgroup reservation failed, we go straight to 4760 if (unlikely(ret)) {
4588 * the shared error handling then. 4761 if (root->fs_info->quota_enabled)
4589 */
4590 if (ret == 0)
4591 ret = reserve_metadata_bytes(root, block_rsv,
4592 to_reserve, flush);
4593
4594 if (ret) {
4595 u64 to_free = 0;
4596 unsigned dropped;
4597
4598 spin_lock(&BTRFS_I(inode)->lock);
4599 dropped = drop_outstanding_extent(inode);
4600 /*
4601 * If the inodes csum_bytes is the same as the original
4602 * csum_bytes then we know we haven't raced with any free()ers
4603 * so we can just reduce our inodes csum bytes and carry on.
4604 * Otherwise we have to do the normal free thing to account for
4605 * the case that the free side didn't free up its reserve
4606 * because of this outstanding reservation.
4607 */
4608 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4609 calc_csum_metadata_size(inode, num_bytes, 0);
4610 else
4611 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4612 spin_unlock(&BTRFS_I(inode)->lock);
4613 if (dropped)
4614 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4615
4616 if (to_free) {
4617 btrfs_block_rsv_release(root, block_rsv, to_free);
4618 trace_btrfs_space_reservation(root->fs_info,
4619 "delalloc",
4620 btrfs_ino(inode),
4621 to_free, 0);
4622 }
4623 if (root->fs_info->quota_enabled) {
4624 btrfs_qgroup_free(root, num_bytes + 4762 btrfs_qgroup_free(root, num_bytes +
4625 nr_extents * root->leafsize); 4763 nr_extents * root->leafsize);
4626 } 4764 goto out_fail;
4627 if (delalloc_lock)
4628 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4629 return ret;
4630 } 4765 }
4631 4766
4632 spin_lock(&BTRFS_I(inode)->lock); 4767 spin_lock(&BTRFS_I(inode)->lock);
@@ -4647,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4647 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4782 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4648 4783
4649 return 0; 4784 return 0;
4785
4786out_fail:
4787 spin_lock(&BTRFS_I(inode)->lock);
4788 dropped = drop_outstanding_extent(inode);
4789 /*
4790 * If the inodes csum_bytes is the same as the original
4791 * csum_bytes then we know we haven't raced with any free()ers
4792 * so we can just reduce our inodes csum bytes and carry on.
4793 * Otherwise we have to do the normal free thing to account for
4794 * the case that the free side didn't free up its reserve
4795 * because of this outstanding reservation.
4796 */
4797 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4798 calc_csum_metadata_size(inode, num_bytes, 0);
4799 else
4800 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4801 spin_unlock(&BTRFS_I(inode)->lock);
4802 if (dropped)
4803 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4804
4805 if (to_free) {
4806 btrfs_block_rsv_release(root, block_rsv, to_free);
4807 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4808 btrfs_ino(inode), to_free, 0);
4809 }
4810 if (delalloc_lock)
4811 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4812 return ret;
4650} 4813}
4651 4814
4652/** 4815/**
@@ -4668,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4668 spin_lock(&BTRFS_I(inode)->lock); 4831 spin_lock(&BTRFS_I(inode)->lock);
4669 dropped = drop_outstanding_extent(inode); 4832 dropped = drop_outstanding_extent(inode);
4670 4833
4671 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4834 if (num_bytes)
4835 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4672 spin_unlock(&BTRFS_I(inode)->lock); 4836 spin_unlock(&BTRFS_I(inode)->lock);
4673 if (dropped > 0) 4837 if (dropped > 0)
4674 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4838 to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4735,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4735 btrfs_free_reserved_data_space(inode, num_bytes); 4899 btrfs_free_reserved_data_space(inode, num_bytes);
4736} 4900}
4737 4901
4738static int update_block_group(struct btrfs_trans_handle *trans, 4902static int update_block_group(struct btrfs_root *root,
4739 struct btrfs_root *root,
4740 u64 bytenr, u64 num_bytes, int alloc) 4903 u64 bytenr, u64 num_bytes, int alloc)
4741{ 4904{
4742 struct btrfs_block_group_cache *cache = NULL; 4905 struct btrfs_block_group_cache *cache = NULL;
@@ -4773,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4773 * space back to the block group, otherwise we will leak space. 4936 * space back to the block group, otherwise we will leak space.
4774 */ 4937 */
4775 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4938 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4776 cache_block_group(cache, trans, NULL, 1); 4939 cache_block_group(cache, 1);
4777 4940
4778 byte_in_group = bytenr - cache->key.objectid; 4941 byte_in_group = bytenr - cache->key.objectid;
4779 WARN_ON(byte_in_group > cache->key.offset); 4942 WARN_ON(byte_in_group > cache->key.offset);
@@ -4823,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4823 struct btrfs_block_group_cache *cache; 4986 struct btrfs_block_group_cache *cache;
4824 u64 bytenr; 4987 u64 bytenr;
4825 4988
4989 spin_lock(&root->fs_info->block_group_cache_lock);
4990 bytenr = root->fs_info->first_logical_byte;
4991 spin_unlock(&root->fs_info->block_group_cache_lock);
4992
4993 if (bytenr < (u64)-1)
4994 return bytenr;
4995
4826 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4996 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4827 if (!cache) 4997 if (!cache)
4828 return 0; 4998 return 0;
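
first_logical_byte() can now short-circuit through fs_info->first_logical_byte: btrfs_add_block_group_cache() and the bytenr == 0 lookup keep it at the smallest block group start, and btrfs_remove_block_group() (later in this patch) resets it to (u64)-1 so the next call falls back to the rb-tree walk. The invariant as a generic sketch, with hypothetical names and the caller assumed to hold the same lock that protects the tree:

        static u64 first_byte = (u64)-1;    /* cheap lower bound on the smallest key */

        static void hint_insert(u64 start)  /* on block group insertion */
        {
                if (start < first_byte)
                        first_byte = start;
        }

        static void hint_remove(u64 start)  /* on block group removal */
        {
                if (start == first_byte)
                        first_byte = (u64)-1;   /* unknown again; recompute lazily */
        }

        static u64 hint_lookup(void)
        {
                return first_byte;              /* (u64)-1 means "walk the tree" */
        }
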
@@ -4873,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
4873/* 5043/*
4874 * this function must be called within transaction 5044 * this function must be called within transaction
4875 */ 5045 */
4876int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 5046int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4877 struct btrfs_root *root,
4878 u64 bytenr, u64 num_bytes) 5047 u64 bytenr, u64 num_bytes)
4879{ 5048{
4880 struct btrfs_block_group_cache *cache; 5049 struct btrfs_block_group_cache *cache;
@@ -4888,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4888 * to one because the slow code to read in the free extents does check 5057 * to one because the slow code to read in the free extents does check
4889 * the pinned extents. 5058 * the pinned extents.
4890 */ 5059 */
4891 cache_block_group(cache, trans, root, 1); 5060 cache_block_group(cache, 1);
4892 5061
4893 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5062 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4894 5063
@@ -5285,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5285 } 5454 }
5286 } 5455 }
5287 5456
5288 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5457 ret = update_block_group(root, bytenr, num_bytes, 0);
5289 if (ret) { 5458 if (ret) {
5290 btrfs_abort_transaction(trans, extent_root, ret); 5459 btrfs_abort_transaction(trans, extent_root, ret);
5291 goto out; 5460 goto out;
@@ -5330,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5330 if (head->extent_op) { 5499 if (head->extent_op) {
5331 if (!head->must_insert_reserved) 5500 if (!head->must_insert_reserved)
5332 goto out; 5501 goto out;
5333 kfree(head->extent_op); 5502 btrfs_free_delayed_extent_op(head->extent_op);
5334 head->extent_op = NULL; 5503 head->extent_op = NULL;
5335 } 5504 }
5336 5505
@@ -5453,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5453 return ret; 5622 return ret;
5454} 5623}
5455 5624
5456static u64 stripe_align(struct btrfs_root *root, u64 val) 5625static u64 stripe_align(struct btrfs_root *root,
5626 struct btrfs_block_group_cache *cache,
5627 u64 val, u64 num_bytes)
5457{ 5628{
5458 u64 mask = ((u64)root->stripesize - 1); 5629 u64 ret = ALIGN(val, root->stripesize);
5459 u64 ret = (val + mask) & ~mask;
5460 return ret; 5630 return ret;
5461} 5631}
5462 5632
@@ -5476,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5476 u64 num_bytes) 5646 u64 num_bytes)
5477{ 5647{
5478 struct btrfs_caching_control *caching_ctl; 5648 struct btrfs_caching_control *caching_ctl;
5479 DEFINE_WAIT(wait);
5480 5649
5481 caching_ctl = get_caching_control(cache); 5650 caching_ctl = get_caching_control(cache);
5482 if (!caching_ctl) 5651 if (!caching_ctl)
@@ -5493,7 +5662,6 @@ static noinline int
5493wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5662wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5494{ 5663{
5495 struct btrfs_caching_control *caching_ctl; 5664 struct btrfs_caching_control *caching_ctl;
5496 DEFINE_WAIT(wait);
5497 5665
5498 caching_ctl = get_caching_control(cache); 5666 caching_ctl = get_caching_control(cache);
5499 if (!caching_ctl) 5667 if (!caching_ctl)
@@ -5507,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5507 5675
5508int __get_raid_index(u64 flags) 5676int __get_raid_index(u64 flags)
5509{ 5677{
5510 int index;
5511
5512 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5678 if (flags & BTRFS_BLOCK_GROUP_RAID10)
5513 index = 0; 5679 return BTRFS_RAID_RAID10;
5514 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5680 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5515 index = 1; 5681 return BTRFS_RAID_RAID1;
5516 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5682 else if (flags & BTRFS_BLOCK_GROUP_DUP)
5517 index = 2; 5683 return BTRFS_RAID_DUP;
5518 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5684 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5519 index = 3; 5685 return BTRFS_RAID_RAID0;
5520 else 5686 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5521 index = 4; 5687 return BTRFS_RAID_RAID5;
5688 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5689 return BTRFS_RAID_RAID6;
5522 5690
5523 return index; 5691 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5524} 5692}
5525 5693
5526static int get_block_group_index(struct btrfs_block_group_cache *cache) 5694static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5663,6 +5831,8 @@ search:
5663 if (!block_group_bits(block_group, data)) { 5831 if (!block_group_bits(block_group, data)) {
5664 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5832 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5665 BTRFS_BLOCK_GROUP_RAID1 | 5833 BTRFS_BLOCK_GROUP_RAID1 |
5834 BTRFS_BLOCK_GROUP_RAID5 |
5835 BTRFS_BLOCK_GROUP_RAID6 |
5666 BTRFS_BLOCK_GROUP_RAID10; 5836 BTRFS_BLOCK_GROUP_RAID10;
5667 5837
5668 /* 5838 /*
@@ -5678,8 +5848,7 @@ have_block_group:
5678 cached = block_group_cache_done(block_group); 5848 cached = block_group_cache_done(block_group);
5679 if (unlikely(!cached)) { 5849 if (unlikely(!cached)) {
5680 found_uncached_bg = true; 5850 found_uncached_bg = true;
5681 ret = cache_block_group(block_group, trans, 5851 ret = cache_block_group(block_group, 0);
5682 orig_root, 0);
5683 BUG_ON(ret < 0); 5852 BUG_ON(ret < 0);
5684 ret = 0; 5853 ret = 0;
5685 } 5854 }
@@ -5692,6 +5861,7 @@ have_block_group:
5692 * lets look there 5861 * lets look there
5693 */ 5862 */
5694 if (last_ptr) { 5863 if (last_ptr) {
5864 unsigned long aligned_cluster;
5695 /* 5865 /*
5696 * the refill lock keeps out other 5866 * the refill lock keeps out other
5697 * people trying to start a new cluster 5867 * people trying to start a new cluster
@@ -5758,11 +5928,15 @@ refill_cluster:
5758 goto unclustered_alloc; 5928 goto unclustered_alloc;
5759 } 5929 }
5760 5930
5931 aligned_cluster = max_t(unsigned long,
5932 empty_cluster + empty_size,
5933 block_group->full_stripe_len);
5934
5761 /* allocate a cluster in this block group */ 5935 /* allocate a cluster in this block group */
5762 ret = btrfs_find_space_cluster(trans, root, 5936 ret = btrfs_find_space_cluster(trans, root,
5763 block_group, last_ptr, 5937 block_group, last_ptr,
5764 search_start, num_bytes, 5938 search_start, num_bytes,
5765 empty_cluster + empty_size); 5939 aligned_cluster);
5766 if (ret == 0) { 5940 if (ret == 0) {
5767 /* 5941 /*
5768 * now pull our allocation out of this 5942 * now pull our allocation out of this
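
Sizing the cluster to at least block_group->full_stripe_len keeps raid56 cluster allocations from straddling a parity stripe. With illustrative numbers (not from this patch), a 4-device RAID5 group with 64 KiB stripe elements has 3 data stripes per full stripe, so:

        full_stripe_len = 3 * 64 KiB = 192 KiB
        aligned_cluster = max(empty_cluster + empty_size, 192 KiB)

and any smaller cluster request is rounded up to a whole stripe's worth of space.
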
@@ -5833,7 +6007,8 @@ unclustered_alloc:
5833 goto loop; 6007 goto loop;
5834 } 6008 }
5835checks: 6009checks:
5836 search_start = stripe_align(root, offset); 6010 search_start = stripe_align(root, used_block_group,
6011 offset, num_bytes);
5837 6012
5838 /* move on to the next group */ 6013 /* move on to the next group */
5839 if (search_start + num_bytes > 6014 if (search_start + num_bytes >
@@ -5984,7 +6159,7 @@ again:
5984 if (ret == -ENOSPC) { 6159 if (ret == -ENOSPC) {
5985 if (!final_tried) { 6160 if (!final_tried) {
5986 num_bytes = num_bytes >> 1; 6161 num_bytes = num_bytes >> 1;
5987 num_bytes = num_bytes & ~(root->sectorsize - 1); 6162 num_bytes = round_down(num_bytes, root->sectorsize);
5988 num_bytes = max(num_bytes, min_alloc_size); 6163 num_bytes = max(num_bytes, min_alloc_size);
5989 if (num_bytes == min_alloc_size) 6164 if (num_bytes == min_alloc_size)
5990 final_tried = true; 6165 final_tried = true;
@@ -6108,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6108 btrfs_mark_buffer_dirty(path->nodes[0]); 6283 btrfs_mark_buffer_dirty(path->nodes[0]);
6109 btrfs_free_path(path); 6284 btrfs_free_path(path);
6110 6285
6111 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6286 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6112 if (ret) { /* -ENOENT, logic error */ 6287 if (ret) { /* -ENOENT, logic error */
6113 printk(KERN_ERR "btrfs update block group failed for %llu " 6288 printk(KERN_ERR "btrfs update block group failed for %llu "
6114 "%llu\n", (unsigned long long)ins->objectid, 6289 "%llu\n", (unsigned long long)ins->objectid,
@@ -6172,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6172 btrfs_mark_buffer_dirty(leaf); 6347 btrfs_mark_buffer_dirty(leaf);
6173 btrfs_free_path(path); 6348 btrfs_free_path(path);
6174 6349
6175 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6350 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6176 if (ret) { /* -ENOENT, logic error */ 6351 if (ret) { /* -ENOENT, logic error */
6177 printk(KERN_ERR "btrfs update block group failed for %llu " 6352 printk(KERN_ERR "btrfs update block group failed for %llu "
6178 "%llu\n", (unsigned long long)ins->objectid, 6353 "%llu\n", (unsigned long long)ins->objectid,
@@ -6215,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6215 u64 num_bytes = ins->offset; 6390 u64 num_bytes = ins->offset;
6216 6391
6217 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6392 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6218 cache_block_group(block_group, trans, NULL, 0); 6393 cache_block_group(block_group, 0);
6219 caching_ctl = get_caching_control(block_group); 6394 caching_ctl = get_caching_control(block_group);
6220 6395
6221 if (!caching_ctl) { 6396 if (!caching_ctl) {
@@ -6329,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6329 if (!ret) 6504 if (!ret)
6330 return block_rsv; 6505 return block_rsv;
6331 if (ret && !block_rsv->failfast) { 6506 if (ret && !block_rsv->failfast) {
6332 static DEFINE_RATELIMIT_STATE(_rs, 6507 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6333 DEFAULT_RATELIMIT_INTERVAL, 6508 static DEFINE_RATELIMIT_STATE(_rs,
6334 /*DEFAULT_RATELIMIT_BURST*/ 2); 6509 DEFAULT_RATELIMIT_INTERVAL * 10,
6335 if (__ratelimit(&_rs)) 6510 /*DEFAULT_RATELIMIT_BURST*/ 1);
6336 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6511 if (__ratelimit(&_rs))
6337 ret); 6512 WARN(1, KERN_DEBUG
6513 "btrfs: block rsv returned %d\n", ret);
6514 }
6338 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6515 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6339 BTRFS_RESERVE_NO_FLUSH); 6516 BTRFS_RESERVE_NO_FLUSH);
6340 if (!ret) { 6517 if (!ret) {
@@ -6400,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6400 6577
6401 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6578 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6402 struct btrfs_delayed_extent_op *extent_op; 6579 struct btrfs_delayed_extent_op *extent_op;
6403 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 6580 extent_op = btrfs_alloc_delayed_extent_op();
6404 BUG_ON(!extent_op); /* -ENOMEM */ 6581 BUG_ON(!extent_op); /* -ENOMEM */
6405 if (key) 6582 if (key)
6406 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6583 memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -6522,7 +6699,7 @@ reada:
6522} 6699}
6523 6700
6524/* 6701/*
6525 * hepler to process tree block while walking down the tree. 6702 * helper to process tree block while walking down the tree.
6526 * 6703 *
6527 * when wc->stage == UPDATE_BACKREF, this function updates 6704 * when wc->stage == UPDATE_BACKREF, this function updates
6528 * back refs for pointers in the block. 6705 * back refs for pointers in the block.
@@ -6597,7 +6774,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6597} 6774}
6598 6775
6599/* 6776/*
6600 * hepler to process tree block pointer. 6777 * helper to process tree block pointer.
6601 * 6778 *
6602 * when wc->stage == DROP_REFERENCE, this function checks 6779 * when wc->stage == DROP_REFERENCE, this function checks
6603 * reference count of the block pointed to. if the block 6780 * reference count of the block pointed to. if the block
@@ -6735,7 +6912,7 @@ skip:
6735} 6912}
6736 6913
6737/* 6914/*
6738 * hepler to process tree block while walking up the tree. 6915 * helper to process tree block while walking up the tree.
6739 * 6916 *
6740 * when wc->stage == DROP_REFERENCE, this function drops 6917 * when wc->stage == DROP_REFERENCE, this function drops
6741 * reference count on the block. 6918 * reference count on the block.
@@ -7203,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7203 root->fs_info->fs_devices->missing_devices; 7380 root->fs_info->fs_devices->missing_devices;
7204 7381
7205 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7382 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7383 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7206 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7384 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7207 7385
7208 if (num_devices == 1) { 7386 if (num_devices == 1) {
@@ -7481,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7481 index = get_block_group_index(block_group); 7659 index = get_block_group_index(block_group);
7482 } 7660 }
7483 7661
7484 if (index == 0) { 7662 if (index == BTRFS_RAID_RAID10) {
7485 dev_min = 4; 7663 dev_min = 4;
7486 /* Divide by 2 */ 7664 /* Divide by 2 */
7487 min_free >>= 1; 7665 min_free >>= 1;
7488 } else if (index == 1) { 7666 } else if (index == BTRFS_RAID_RAID1) {
7489 dev_min = 2; 7667 dev_min = 2;
7490 } else if (index == 2) { 7668 } else if (index == BTRFS_RAID_DUP) {
7491 /* Multiply by 2 */ 7669 /* Multiply by 2 */
7492 min_free <<= 1; 7670 min_free <<= 1;
7493 } else if (index == 3) { 7671 } else if (index == BTRFS_RAID_RAID0) {
7494 dev_min = fs_devices->rw_devices; 7672 dev_min = fs_devices->rw_devices;
7495 do_div(min_free, dev_min); 7673 do_div(min_free, dev_min);
7496 } 7674 }
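
With the named BTRFS_RAID_* indices the relocation maths is easier to follow. Roughly, for a block group with 1 GiB of used space (illustrative figures): as RAID10 the check wants at least 4 writable devices, each with room for min_free >> 1 = 512 MiB; as DUP it wants a single device with min_free << 1 = 2 GiB free; as RAID0 min_free is divided across all rw_devices, e.g. 256 MiB per device on a 4-device filesystem.
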
@@ -7651,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7651 space_info = list_entry(info->space_info.next, 7829 space_info = list_entry(info->space_info.next,
7652 struct btrfs_space_info, 7830 struct btrfs_space_info,
7653 list); 7831 list);
7654 if (space_info->bytes_pinned > 0 || 7832 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7655 space_info->bytes_reserved > 0 || 7833 if (space_info->bytes_pinned > 0 ||
7656 space_info->bytes_may_use > 0) { 7834 space_info->bytes_reserved > 0 ||
7657 WARN_ON(1); 7835 space_info->bytes_may_use > 0) {
7658 dump_space_info(space_info, 0, 0); 7836 WARN_ON(1);
7837 dump_space_info(space_info, 0, 0);
7838 }
7659 } 7839 }
7660 list_del(&space_info->list); 7840 list_del(&space_info->list);
7661 kfree(space_info); 7841 kfree(space_info);
@@ -7754,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7754 btrfs_release_path(path); 7934 btrfs_release_path(path);
7755 cache->flags = btrfs_block_group_flags(&cache->item); 7935 cache->flags = btrfs_block_group_flags(&cache->item);
7756 cache->sectorsize = root->sectorsize; 7936 cache->sectorsize = root->sectorsize;
7757 7937 cache->full_stripe_len = btrfs_full_stripe_len(root,
7938 &root->fs_info->mapping_tree,
7939 found_key.objectid);
7758 btrfs_init_free_space_ctl(cache); 7940 btrfs_init_free_space_ctl(cache);
7759 7941
7760 /* 7942 /*
@@ -7808,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7808 if (!(get_alloc_profile(root, space_info->flags) & 7990 if (!(get_alloc_profile(root, space_info->flags) &
7809 (BTRFS_BLOCK_GROUP_RAID10 | 7991 (BTRFS_BLOCK_GROUP_RAID10 |
7810 BTRFS_BLOCK_GROUP_RAID1 | 7992 BTRFS_BLOCK_GROUP_RAID1 |
7993 BTRFS_BLOCK_GROUP_RAID5 |
7994 BTRFS_BLOCK_GROUP_RAID6 |
7811 BTRFS_BLOCK_GROUP_DUP))) 7995 BTRFS_BLOCK_GROUP_DUP)))
7812 continue; 7996 continue;
7813 /* 7997 /*
@@ -7883,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7883 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8067 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7884 cache->sectorsize = root->sectorsize; 8068 cache->sectorsize = root->sectorsize;
7885 cache->fs_info = root->fs_info; 8069 cache->fs_info = root->fs_info;
8070 cache->full_stripe_len = btrfs_full_stripe_len(root,
8071 &root->fs_info->mapping_tree,
8072 chunk_offset);
7886 8073
7887 atomic_set(&cache->count, 1); 8074 atomic_set(&cache->count, 1);
7888 spin_lock_init(&cache->lock); 8075 spin_lock_init(&cache->lock);
@@ -7932,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7932 u64 extra_flags = chunk_to_extended(flags) & 8119 u64 extra_flags = chunk_to_extended(flags) &
7933 BTRFS_EXTENDED_PROFILE_MASK; 8120 BTRFS_EXTENDED_PROFILE_MASK;
7934 8121
8122 write_seqlock(&fs_info->profiles_lock);
7935 if (flags & BTRFS_BLOCK_GROUP_DATA) 8123 if (flags & BTRFS_BLOCK_GROUP_DATA)
7936 fs_info->avail_data_alloc_bits &= ~extra_flags; 8124 fs_info->avail_data_alloc_bits &= ~extra_flags;
7937 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8125 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7938 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8126 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7939 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8127 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7940 fs_info->avail_system_alloc_bits &= ~extra_flags; 8128 fs_info->avail_system_alloc_bits &= ~extra_flags;
8129 write_sequnlock(&fs_info->profiles_lock);
7941} 8130}
7942 8131
7943int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8132int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8036,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8036 spin_lock(&root->fs_info->block_group_cache_lock); 8225 spin_lock(&root->fs_info->block_group_cache_lock);
8037 rb_erase(&block_group->cache_node, 8226 rb_erase(&block_group->cache_node,
8038 &root->fs_info->block_group_cache_tree); 8227 &root->fs_info->block_group_cache_tree);
8228
8229 if (root->fs_info->first_logical_byte == block_group->key.objectid)
8230 root->fs_info->first_logical_byte = (u64)-1;
8039 spin_unlock(&root->fs_info->block_group_cache_lock); 8231 spin_unlock(&root->fs_info->block_group_cache_lock);
8040 8232
8041 down_write(&block_group->space_info->groups_sem); 8233 down_write(&block_group->space_info->groups_sem);
@@ -8158,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8158 8350
8159 if (end - start >= range->minlen) { 8351 if (end - start >= range->minlen) {
8160 if (!block_group_cache_done(cache)) { 8352 if (!block_group_cache_done(cache)) {
8161 ret = cache_block_group(cache, NULL, root, 0); 8353 ret = cache_block_group(cache, 0);
8162 if (!ret) 8354 if (!ret)
8163 wait_block_group_cache_done(cache); 8355 wait_block_group_cache_done(cache);
8164 } 8356 }