Diffstat (limited to 'fs/btrfs/extent-tree.c')
-rw-r--r--  fs/btrfs/extent-tree.c  578
1 file changed, 392 insertions, 186 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cf54bdfee334..3e074dab2d57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -31,6 +31,7 @@
31#include "print-tree.h" 31#include "print-tree.h"
32#include "transaction.h" 32#include "transaction.h"
33#include "volumes.h" 33#include "volumes.h"
34#include "raid56.h"
34#include "locking.h" 35#include "locking.h"
35#include "free-space-cache.h" 36#include "free-space-cache.h"
36#include "math.h" 37#include "math.h"
@@ -72,8 +73,7 @@ enum {
72 RESERVE_ALLOC_NO_ACCOUNT = 2, 73 RESERVE_ALLOC_NO_ACCOUNT = 2,
73}; 74};
74 75
75static int update_block_group(struct btrfs_trans_handle *trans, 76static int update_block_group(struct btrfs_root *root,
76 struct btrfs_root *root,
77 u64 bytenr, u64 num_bytes, int alloc); 77 u64 bytenr, u64 num_bytes, int alloc);
78static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 78static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79 struct btrfs_root *root, 79 struct btrfs_root *root,
@@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103 int dump_block_groups); 103 int dump_block_groups);
104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 104static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105 u64 num_bytes, int reserve); 105 u64 num_bytes, int reserve);
106static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107 u64 num_bytes);
106 108
107static noinline int 109static noinline int
108block_group_cache_done(struct btrfs_block_group_cache *cache) 110block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
162 rb_link_node(&block_group->cache_node, parent, p); 164 rb_link_node(&block_group->cache_node, parent, p);
163 rb_insert_color(&block_group->cache_node, 165 rb_insert_color(&block_group->cache_node,
164 &info->block_group_cache_tree); 166 &info->block_group_cache_tree);
167
168 if (info->first_logical_byte > block_group->key.objectid)
169 info->first_logical_byte = block_group->key.objectid;
170
165 spin_unlock(&info->block_group_cache_lock); 171 spin_unlock(&info->block_group_cache_lock);
166 172
167 return 0; 173 return 0;
@@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
203 break; 209 break;
204 } 210 }
205 } 211 }
206 if (ret) 212 if (ret) {
207 btrfs_get_block_group(ret); 213 btrfs_get_block_group(ret);
214 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215 info->first_logical_byte = ret->key.objectid;
216 }
208 spin_unlock(&info->block_group_cache_lock); 217 spin_unlock(&info->block_group_cache_lock);
209 218
210 return ret; 219 return ret;
@@ -468,8 +477,6 @@ out:
468} 477}
469 478
470static int cache_block_group(struct btrfs_block_group_cache *cache, 479static int cache_block_group(struct btrfs_block_group_cache *cache,
471 struct btrfs_trans_handle *trans,
472 struct btrfs_root *root,
473 int load_cache_only) 480 int load_cache_only)
474{ 481{
475 DEFINE_WAIT(wait); 482 DEFINE_WAIT(wait);
@@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
527 cache->cached = BTRFS_CACHE_FAST; 534 cache->cached = BTRFS_CACHE_FAST;
528 spin_unlock(&cache->lock); 535 spin_unlock(&cache->lock);
529 536
530 /*
531 * We can't do the read from on-disk cache during a commit since we need
532 * to have the normal tree locking. Also if we are currently trying to
533 * allocate blocks for the tree root we can't do the fast caching since
534 * we likely hold important locks.
535 */
536 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { 537 if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537 ret = load_free_space_cache(fs_info, cache); 538 ret = load_free_space_cache(fs_info, cache);
538 539
@@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1852 *actual_bytes = discarded_bytes; 1853 *actual_bytes = discarded_bytes;
1853 1854
1854 1855
1856 if (ret == -EOPNOTSUPP)
1857 ret = 0;
1855 return ret; 1858 return ret;
1856} 1859}
1857 1860
@@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2143 node->num_bytes); 2146 node->num_bytes);
2144 } 2147 }
2145 } 2148 }
2146 mutex_unlock(&head->mutex);
2147 return ret; 2149 return ret;
2148 } 2150 }
2149 2151
@@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2258 * process of being added. Don't run this ref yet. 2260 * process of being added. Don't run this ref yet.
2259 */ 2261 */
2260 list_del_init(&locked_ref->cluster); 2262 list_del_init(&locked_ref->cluster);
2261 mutex_unlock(&locked_ref->mutex); 2263 btrfs_delayed_ref_unlock(locked_ref);
2262 locked_ref = NULL; 2264 locked_ref = NULL;
2263 delayed_refs->num_heads_ready++; 2265 delayed_refs->num_heads_ready++;
2264 spin_unlock(&delayed_refs->lock); 2266 spin_unlock(&delayed_refs->lock);
@@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2285 ref = &locked_ref->node; 2287 ref = &locked_ref->node;
2286 2288
2287 if (extent_op && must_insert_reserved) { 2289 if (extent_op && must_insert_reserved) {
2288 kfree(extent_op); 2290 btrfs_free_delayed_extent_op(extent_op);
2289 extent_op = NULL; 2291 extent_op = NULL;
2290 } 2292 }
2291 2293
@@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2294 2296
2295 ret = run_delayed_extent_op(trans, root, 2297 ret = run_delayed_extent_op(trans, root,
2296 ref, extent_op); 2298 ref, extent_op);
2297 kfree(extent_op); 2299 btrfs_free_delayed_extent_op(extent_op);
2298 2300
2299 if (ret) { 2301 if (ret) {
2300 list_del_init(&locked_ref->cluster); 2302 printk(KERN_DEBUG
2301 mutex_unlock(&locked_ref->mutex); 2303 "btrfs: run_delayed_extent_op "
2302 2304 "returned %d\n", ret);
2303 printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
2304 spin_lock(&delayed_refs->lock); 2305 spin_lock(&delayed_refs->lock);
2306 btrfs_delayed_ref_unlock(locked_ref);
2305 return ret; 2307 return ret;
2306 } 2308 }
2307 2309
2308 goto next; 2310 goto next;
2309 } 2311 }
2310
2311 list_del_init(&locked_ref->cluster);
2312 locked_ref = NULL;
2313 } 2312 }
2314 2313
2315 ref->in_tree = 0; 2314 ref->in_tree = 0;
2316 rb_erase(&ref->rb_node, &delayed_refs->root); 2315 rb_erase(&ref->rb_node, &delayed_refs->root);
2317 delayed_refs->num_entries--; 2316 delayed_refs->num_entries--;
2318 if (locked_ref) { 2317 if (!btrfs_delayed_ref_is_head(ref)) {
2319 /* 2318 /*
2320 * when we play the delayed ref, also correct the 2319 * when we play the delayed ref, also correct the
2321 * ref_mod on head 2320 * ref_mod on head
@@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2337 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2336 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2338 must_insert_reserved); 2337 must_insert_reserved);
2339 2338
2340 btrfs_put_delayed_ref(ref); 2339 btrfs_free_delayed_extent_op(extent_op);
2341 kfree(extent_op);
2342 count++;
2343
2344 if (ret) { 2340 if (ret) {
2345 if (locked_ref) { 2341 btrfs_delayed_ref_unlock(locked_ref);
2346 list_del_init(&locked_ref->cluster); 2342 btrfs_put_delayed_ref(ref);
2347 mutex_unlock(&locked_ref->mutex); 2343 printk(KERN_DEBUG
2348 } 2344 "btrfs: run_one_delayed_ref returned %d\n", ret);
2349 printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
2350 spin_lock(&delayed_refs->lock); 2345 spin_lock(&delayed_refs->lock);
2351 return ret; 2346 return ret;
2352 } 2347 }
2353 2348
2349 /*
2350 * If this node is a head, that means all the refs in this head
2351 * have been dealt with, and we will pick the next head to deal
2352 * with, so we must unlock the head and drop it from the cluster
2353 * list before we release it.
2354 */
2355 if (btrfs_delayed_ref_is_head(ref)) {
2356 list_del_init(&locked_ref->cluster);
2357 btrfs_delayed_ref_unlock(locked_ref);
2358 locked_ref = NULL;
2359 }
2360 btrfs_put_delayed_ref(ref);
2361 count++;
2354next: 2362next:
2355 cond_resched(); 2363 cond_resched();
2356 spin_lock(&delayed_refs->lock); 2364 spin_lock(&delayed_refs->lock);
@@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2435 return ret; 2443 return ret;
2436} 2444}
2437 2445
2446static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2447 int count)
2448{
2449 int val = atomic_read(&delayed_refs->ref_seq);
2450
2451 if (val < seq || val >= seq + count)
2452 return 1;
2453 return 0;
2454}
2455
2438/* 2456/*
2439 * this starts processing the delayed reference count updates and 2457 * this starts processing the delayed reference count updates and
2440 * extent insertions we have queued up so far. count can be 2458 * extent insertions we have queued up so far. count can be
@@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2469 2487
2470 delayed_refs = &trans->transaction->delayed_refs; 2488 delayed_refs = &trans->transaction->delayed_refs;
2471 INIT_LIST_HEAD(&cluster); 2489 INIT_LIST_HEAD(&cluster);
2490 if (count == 0) {
2491 count = delayed_refs->num_entries * 2;
2492 run_most = 1;
2493 }
2494
2495 if (!run_all && !run_most) {
2496 int old;
2497 int seq = atomic_read(&delayed_refs->ref_seq);
2498
2499progress:
2500 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2501 if (old) {
2502 DEFINE_WAIT(__wait);
2503 if (delayed_refs->num_entries < 16348)
2504 return 0;
2505
2506 prepare_to_wait(&delayed_refs->wait, &__wait,
2507 TASK_UNINTERRUPTIBLE);
2508
2509 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2510 if (old) {
2511 schedule();
2512 finish_wait(&delayed_refs->wait, &__wait);
2513
2514 if (!refs_newer(delayed_refs, seq, 256))
2515 goto progress;
2516 else
2517 return 0;
2518 } else {
2519 finish_wait(&delayed_refs->wait, &__wait);
2520 goto again;
2521 }
2522 }
2523
2524 } else {
2525 atomic_inc(&delayed_refs->procs_running_refs);
2526 }
2527
2472again: 2528again:
2473 loops = 0; 2529 loops = 0;
2474 spin_lock(&delayed_refs->lock); 2530 spin_lock(&delayed_refs->lock);
@@ -2477,10 +2533,6 @@ again:
2477 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2533 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2478#endif 2534#endif
2479 2535
2480 if (count == 0) {
2481 count = delayed_refs->num_entries * 2;
2482 run_most = 1;
2483 }
2484 while (1) { 2536 while (1) {
2485 if (!(run_all || run_most) && 2537 if (!(run_all || run_most) &&
2486 delayed_refs->num_heads_ready < 64) 2538 delayed_refs->num_heads_ready < 64)
@@ -2500,11 +2552,15 @@ again:
2500 2552
2501 ret = run_clustered_refs(trans, root, &cluster); 2553 ret = run_clustered_refs(trans, root, &cluster);
2502 if (ret < 0) { 2554 if (ret < 0) {
2555 btrfs_release_ref_cluster(&cluster);
2503 spin_unlock(&delayed_refs->lock); 2556 spin_unlock(&delayed_refs->lock);
2504 btrfs_abort_transaction(trans, root, ret); 2557 btrfs_abort_transaction(trans, root, ret);
2558 atomic_dec(&delayed_refs->procs_running_refs);
2505 return ret; 2559 return ret;
2506 } 2560 }
2507 2561
2562 atomic_add(ret, &delayed_refs->ref_seq);
2563
2508 count -= min_t(unsigned long, ret, count); 2564 count -= min_t(unsigned long, ret, count);
2509 2565
2510 if (count == 0) 2566 if (count == 0)
@@ -2573,6 +2629,11 @@ again:
2573 goto again; 2629 goto again;
2574 } 2630 }
2575out: 2631out:
2632 atomic_dec(&delayed_refs->procs_running_refs);
2633 smp_mb();
2634 if (waitqueue_active(&delayed_refs->wait))
2635 wake_up(&delayed_refs->wait);
2636
2576 spin_unlock(&delayed_refs->lock); 2637 spin_unlock(&delayed_refs->lock);
2577 assert_qgroups_uptodate(trans); 2638 assert_qgroups_uptodate(trans);
2578 return 0; 2639 return 0;
@@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586 struct btrfs_delayed_extent_op *extent_op; 2647 struct btrfs_delayed_extent_op *extent_op;
2587 int ret; 2648 int ret;
2588 2649
2589 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 2650 extent_op = btrfs_alloc_delayed_extent_op();
2590 if (!extent_op) 2651 if (!extent_op)
2591 return -ENOMEM; 2652 return -ENOMEM;
2592 2653
@@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2598 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, 2659 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2599 num_bytes, extent_op); 2660 num_bytes, extent_op);
2600 if (ret) 2661 if (ret)
2601 kfree(extent_op); 2662 btrfs_free_delayed_extent_op(extent_op);
2602 return ret; 2663 return ret;
2603} 2664}
2604 2665
@@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3223 u64 extra_flags = chunk_to_extended(flags) & 3284 u64 extra_flags = chunk_to_extended(flags) &
3224 BTRFS_EXTENDED_PROFILE_MASK; 3285 BTRFS_EXTENDED_PROFILE_MASK;
3225 3286
3287 write_seqlock(&fs_info->profiles_lock);
3226 if (flags & BTRFS_BLOCK_GROUP_DATA) 3288 if (flags & BTRFS_BLOCK_GROUP_DATA)
3227 fs_info->avail_data_alloc_bits |= extra_flags; 3289 fs_info->avail_data_alloc_bits |= extra_flags;
3228 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3290 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3229 fs_info->avail_metadata_alloc_bits |= extra_flags; 3291 fs_info->avail_metadata_alloc_bits |= extra_flags;
3230 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3292 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3231 fs_info->avail_system_alloc_bits |= extra_flags; 3293 fs_info->avail_system_alloc_bits |= extra_flags;
3294 write_sequnlock(&fs_info->profiles_lock);
3232} 3295}
3233 3296
3234/* 3297/*
@@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3339 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3277 root->fs_info->fs_devices->missing_devices; 3340 root->fs_info->fs_devices->missing_devices;
3278 u64 target; 3341 u64 target;
3342 u64 tmp;
3279 3343
3280 /* 3344 /*
3281 * see if restripe for this chunk_type is in progress, if so 3345 * see if restripe for this chunk_type is in progress, if so
@@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3292 } 3356 }
3293 spin_unlock(&root->fs_info->balance_lock); 3357 spin_unlock(&root->fs_info->balance_lock);
3294 3358
3359 /* First, mask out the RAID levels which aren't possible */
3295 if (num_devices == 1) 3360 if (num_devices == 1)
3296 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3361 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3362 BTRFS_BLOCK_GROUP_RAID5);
3363 if (num_devices < 3)
3364 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3297 if (num_devices < 4) 3365 if (num_devices < 4)
3298 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3366 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3299 3367
3300 if ((flags & BTRFS_BLOCK_GROUP_DUP) && 3368 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3301 (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3369 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3302 BTRFS_BLOCK_GROUP_RAID10))) { 3370 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3303 flags &= ~BTRFS_BLOCK_GROUP_DUP; 3371 flags &= ~tmp;
3304 }
3305
3306 if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3307 (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3308 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3309 }
3310 3372
3311 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3373 if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3312 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3374 tmp = BTRFS_BLOCK_GROUP_RAID6;
3313 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3375 else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3314 (flags & BTRFS_BLOCK_GROUP_DUP))) { 3376 tmp = BTRFS_BLOCK_GROUP_RAID5;
3315 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3377 else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3316 } 3378 tmp = BTRFS_BLOCK_GROUP_RAID10;
3379 else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3380 tmp = BTRFS_BLOCK_GROUP_RAID1;
3381 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3382 tmp = BTRFS_BLOCK_GROUP_RAID0;
3317 3383
3318 return extended_to_chunk(flags); 3384 return extended_to_chunk(flags | tmp);
3319} 3385}
3320 3386
3321static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3387static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3322{ 3388{
3323 if (flags & BTRFS_BLOCK_GROUP_DATA) 3389 unsigned seq;
3324 flags |= root->fs_info->avail_data_alloc_bits; 3390
3325 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3391 do {
3326 flags |= root->fs_info->avail_system_alloc_bits; 3392 seq = read_seqbegin(&root->fs_info->profiles_lock);
3327 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3393
3328 flags |= root->fs_info->avail_metadata_alloc_bits; 3394 if (flags & BTRFS_BLOCK_GROUP_DATA)
3395 flags |= root->fs_info->avail_data_alloc_bits;
3396 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3397 flags |= root->fs_info->avail_system_alloc_bits;
3398 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3399 flags |= root->fs_info->avail_metadata_alloc_bits;
3400 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3329 3401
3330 return btrfs_reduce_alloc_profile(root, flags); 3402 return btrfs_reduce_alloc_profile(root, flags);
3331} 3403}
@@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3333u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3405u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3334{ 3406{
3335 u64 flags; 3407 u64 flags;
3408 u64 ret;
3336 3409
3337 if (data) 3410 if (data)
3338 flags = BTRFS_BLOCK_GROUP_DATA; 3411 flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3341 else 3414 else
3342 flags = BTRFS_BLOCK_GROUP_METADATA; 3415 flags = BTRFS_BLOCK_GROUP_METADATA;
3343 3416
3344 return get_alloc_profile(root, flags); 3417 ret = get_alloc_profile(root, flags);
3418 return ret;
3345} 3419}
3346 3420
3347/* 3421/*
@@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3357 int ret = 0, committed = 0, alloc_chunk = 1; 3431 int ret = 0, committed = 0, alloc_chunk = 1;
3358 3432
3359 /* make sure bytes are sectorsize aligned */ 3433 /* make sure bytes are sectorsize aligned */
3360 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3434 bytes = ALIGN(bytes, root->sectorsize);
3361 3435
3362 if (root == root->fs_info->tree_root || 3436 if (root == root->fs_info->tree_root ||
3363 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { 3437 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3452 struct btrfs_space_info *data_sinfo; 3526 struct btrfs_space_info *data_sinfo;
3453 3527
3454 /* make sure bytes are sectorsize aligned */ 3528 /* make sure bytes are sectorsize aligned */
3455 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3529 bytes = ALIGN(bytes, root->sectorsize);
3456 3530
3457 data_sinfo = root->fs_info->data_sinfo; 3531 data_sinfo = root->fs_info->data_sinfo;
3458 spin_lock(&data_sinfo->lock); 3532 spin_lock(&data_sinfo->lock);
@@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3516{ 3590{
3517 u64 num_dev; 3591 u64 num_dev;
3518 3592
3519 if (type & BTRFS_BLOCK_GROUP_RAID10 || 3593 if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3520 type & BTRFS_BLOCK_GROUP_RAID0) 3594 BTRFS_BLOCK_GROUP_RAID0 |
3595 BTRFS_BLOCK_GROUP_RAID5 |
3596 BTRFS_BLOCK_GROUP_RAID6))
3521 num_dev = root->fs_info->fs_devices->rw_devices; 3597 num_dev = root->fs_info->fs_devices->rw_devices;
3522 else if (type & BTRFS_BLOCK_GROUP_RAID1) 3598 else if (type & BTRFS_BLOCK_GROUP_RAID1)
3523 num_dev = 2; 3599 num_dev = 2;
@@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3564 int wait_for_alloc = 0; 3640 int wait_for_alloc = 0;
3565 int ret = 0; 3641 int ret = 0;
3566 3642
3643 /* Don't re-enter if we're already allocating a chunk */
3644 if (trans->allocating_chunk)
3645 return -ENOSPC;
3646
3567 space_info = __find_space_info(extent_root->fs_info, flags); 3647 space_info = __find_space_info(extent_root->fs_info, flags);
3568 if (!space_info) { 3648 if (!space_info) {
3569 ret = update_space_info(extent_root->fs_info, flags, 3649 ret = update_space_info(extent_root->fs_info, flags,
@@ -3606,6 +3686,8 @@ again:
3606 goto again; 3686 goto again;
3607 } 3687 }
3608 3688
3689 trans->allocating_chunk = true;
3690
3609 /* 3691 /*
3610 * If we have mixed data/metadata chunks we want to make sure we keep 3692 * If we have mixed data/metadata chunks we want to make sure we keep
3611 * allocating mixed chunks instead of individual chunks. 3693 * allocating mixed chunks instead of individual chunks.
@@ -3632,19 +3714,20 @@ again:
3632 check_system_chunk(trans, extent_root, flags); 3714 check_system_chunk(trans, extent_root, flags);
3633 3715
3634 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3716 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3635 if (ret < 0 && ret != -ENOSPC) 3717 trans->allocating_chunk = false;
3636 goto out;
3637 3718
3638 spin_lock(&space_info->lock); 3719 spin_lock(&space_info->lock);
3720 if (ret < 0 && ret != -ENOSPC)
3721 goto out;
3639 if (ret) 3722 if (ret)
3640 space_info->full = 1; 3723 space_info->full = 1;
3641 else 3724 else
3642 ret = 1; 3725 ret = 1;
3643 3726
3644 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3727 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3728out:
3645 space_info->chunk_alloc = 0; 3729 space_info->chunk_alloc = 0;
3646 spin_unlock(&space_info->lock); 3730 spin_unlock(&space_info->lock);
3647out:
3648 mutex_unlock(&fs_info->chunk_mutex); 3731 mutex_unlock(&fs_info->chunk_mutex);
3649 return ret; 3732 return ret;
3650} 3733}
@@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root,
3653 struct btrfs_space_info *space_info, u64 bytes, 3736 struct btrfs_space_info *space_info, u64 bytes,
3654 enum btrfs_reserve_flush_enum flush) 3737 enum btrfs_reserve_flush_enum flush)
3655{ 3738{
3739 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3656 u64 profile = btrfs_get_alloc_profile(root, 0); 3740 u64 profile = btrfs_get_alloc_profile(root, 0);
3741 u64 rsv_size = 0;
3657 u64 avail; 3742 u64 avail;
3658 u64 used; 3743 u64 used;
3744 u64 to_add;
3659 3745
3660 used = space_info->bytes_used + space_info->bytes_reserved + 3746 used = space_info->bytes_used + space_info->bytes_reserved +
3661 space_info->bytes_pinned + space_info->bytes_readonly + 3747 space_info->bytes_pinned + space_info->bytes_readonly;
3662 space_info->bytes_may_use; 3748
3749 spin_lock(&global_rsv->lock);
3750 rsv_size = global_rsv->size;
3751 spin_unlock(&global_rsv->lock);
3752
3753 /*
3754 * We only want to allow over committing if we have lots of actual space
3755 * free, but if we don't have enough space to handle the global reserve
3756 * space then we could end up having a real enospc problem when trying
3757 * to allocate a chunk or some other such important allocation.
3758 */
3759 rsv_size <<= 1;
3760 if (used + rsv_size >= space_info->total_bytes)
3761 return 0;
3762
3763 used += space_info->bytes_may_use;
3663 3764
3664 spin_lock(&root->fs_info->free_chunk_lock); 3765 spin_lock(&root->fs_info->free_chunk_lock);
3665 avail = root->fs_info->free_chunk_space; 3766 avail = root->fs_info->free_chunk_space;
@@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root,
3667 3768
3668 /* 3769 /*
3669 * If we have dup, raid1 or raid10 then only half of the free 3770 * If we have dup, raid1 or raid10 then only half of the free
3670 * space is actually useable. 3771 * space is actually useable. For raid56, the space info used
3772 * doesn't include the parity drive, so we don't have to
3773 * change the math
3671 */ 3774 */
3672 if (profile & (BTRFS_BLOCK_GROUP_DUP | 3775 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3673 BTRFS_BLOCK_GROUP_RAID1 | 3776 BTRFS_BLOCK_GROUP_RAID1 |
3674 BTRFS_BLOCK_GROUP_RAID10)) 3777 BTRFS_BLOCK_GROUP_RAID10))
3675 avail >>= 1; 3778 avail >>= 1;
3676 3779
3780 to_add = space_info->total_bytes;
3781
3677 /* 3782 /*
3678 * If we aren't flushing all things, let us overcommit up to 3783 * If we aren't flushing all things, let us overcommit up to
3679 * 1/2th of the space. If we can flush, don't let us overcommit 3784 * 1/2th of the space. If we can flush, don't let us overcommit
3680 * too much, let it overcommit up to 1/8 of the space. 3785 * too much, let it overcommit up to 1/8 of the space.
3681 */ 3786 */
3682 if (flush == BTRFS_RESERVE_FLUSH_ALL) 3787 if (flush == BTRFS_RESERVE_FLUSH_ALL)
3683 avail >>= 3; 3788 to_add >>= 3;
3684 else 3789 else
3685 avail >>= 1; 3790 to_add >>= 1;
3686 3791
3687 if (used + bytes < space_info->total_bytes + avail) 3792 /*
3793 * Limit the overcommit to the amount of free space we could possibly
3794 * allocate for chunks.
3795 */
3796 to_add = min(avail, to_add);
3797
3798 if (used + bytes < space_info->total_bytes + to_add)
3688 return 1; 3799 return 1;
3689 return 0; 3800 return 0;
3690} 3801}
3691 3802
3803void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3804 unsigned long nr_pages)
3805{
3806 struct super_block *sb = root->fs_info->sb;
3807 int started;
3808
3809 /* If we cannot start writeback, just sync all the delalloc files. */
3810 started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3811 WB_REASON_FS_FREE_SPACE);
3812 if (!started) {
3813 /*
3814 * We needn't worry about the filesystem going from r/w to r/o though
3815 * we don't acquire the ->s_umount mutex, because the filesystem
3816 * should guarantee the delalloc inodes list is empty after
3817 * the filesystem is read-only (all dirty pages are written to
3818 * the disk).
3819 */
3820 btrfs_start_delalloc_inodes(root, 0);
3821 btrfs_wait_ordered_extents(root, 0);
3822 }
3823}
3824
3692/* 3825/*
3693 * shrink metadata reservation for delalloc 3826 * shrink metadata reservation for delalloc
3694 */ 3827 */
@@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3710 space_info = block_rsv->space_info; 3843 space_info = block_rsv->space_info;
3711 3844
3712 smp_mb(); 3845 smp_mb();
3713 delalloc_bytes = root->fs_info->delalloc_bytes; 3846 delalloc_bytes = percpu_counter_sum_positive(
3847 &root->fs_info->delalloc_bytes);
3714 if (delalloc_bytes == 0) { 3848 if (delalloc_bytes == 0) {
3715 if (trans) 3849 if (trans)
3716 return; 3850 return;
@@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3721 while (delalloc_bytes && loops < 3) { 3855 while (delalloc_bytes && loops < 3) {
3722 max_reclaim = min(delalloc_bytes, to_reclaim); 3856 max_reclaim = min(delalloc_bytes, to_reclaim);
3723 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; 3857 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3724 try_to_writeback_inodes_sb_nr(root->fs_info->sb, 3858 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3725 nr_pages,
3726 WB_REASON_FS_FREE_SPACE);
3727
3728 /* 3859 /*
3729 * We need to wait for the async pages to actually start before 3860 * We need to wait for the async pages to actually start before
3730 * we do anything. 3861 * we do anything.
@@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3752 break; 3883 break;
3753 } 3884 }
3754 smp_mb(); 3885 smp_mb();
3755 delalloc_bytes = root->fs_info->delalloc_bytes; 3886 delalloc_bytes = percpu_counter_sum_positive(
3887 &root->fs_info->delalloc_bytes);
3756 } 3888 }
3757} 3889}
3758 3890
@@ -4016,6 +4148,15 @@ again:
4016 goto again; 4148 goto again;
4017 4149
4018out: 4150out:
4151 if (ret == -ENOSPC &&
4152 unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4153 struct btrfs_block_rsv *global_rsv =
4154 &root->fs_info->global_block_rsv;
4155
4156 if (block_rsv != global_rsv &&
4157 !block_rsv_use_bytes(global_rsv, orig_bytes))
4158 ret = 0;
4159 }
4019 if (flushing) { 4160 if (flushing) {
4020 spin_lock(&space_info->lock); 4161 spin_lock(&space_info->lock);
4021 space_info->flush = 0; 4162 space_info->flush = 0;
@@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4402 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4543 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4403} 4544}
4404 4545
4405int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, 4546/*
4406 struct btrfs_pending_snapshot *pending) 4547 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4548 * root: the root of the parent directory
4549 * rsv: block reservation
4550 * items: the number of items that we need to do reservation for
4551 * qgroup_reserved: used to return the reserved size in qgroup
4552 *
4553 * This function is used to reserve the space for snapshot/subvolume
4554 * creation and deletion. Those operations are different from the
4555 * common file/directory operations; they change two fs/file trees
4556 * and the root tree, so the number of items that the qgroup reserves is
4557 * different from the free space reservation. So we cannot use
4558 * the space reservation mechanism in start_transaction().
4559 */
4560int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4561 struct btrfs_block_rsv *rsv,
4562 int items,
4563 u64 *qgroup_reserved)
4407{ 4564{
4408 struct btrfs_root *root = pending->root; 4565 u64 num_bytes;
4409 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); 4566 int ret;
4410 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; 4567
4411 /* 4568 if (root->fs_info->quota_enabled) {
4412 * two for root back/forward refs, two for directory entries, 4569 /* One for parent inode, two for dir entries */
4413 * one for root of the snapshot and one for parent inode. 4570 num_bytes = 3 * root->leafsize;
4414 */ 4571 ret = btrfs_qgroup_reserve(root, num_bytes);
4415 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); 4572 if (ret)
4416 dst_rsv->space_info = src_rsv->space_info; 4573 return ret;
4417 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4574 } else {
4575 num_bytes = 0;
4576 }
4577
4578 *qgroup_reserved = num_bytes;
4579
4580 num_bytes = btrfs_calc_trans_metadata_size(root, items);
4581 rsv->space_info = __find_space_info(root->fs_info,
4582 BTRFS_BLOCK_GROUP_METADATA);
4583 ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4584 BTRFS_RESERVE_FLUSH_ALL);
4585 if (ret) {
4586 if (*qgroup_reserved)
4587 btrfs_qgroup_free(root, *qgroup_reserved);
4588 }
4589
4590 return ret;
4591}
4592
4593void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4594 struct btrfs_block_rsv *rsv,
4595 u64 qgroup_reserved)
4596{
4597 btrfs_block_rsv_release(root, rsv, (u64)-1);
4598 if (qgroup_reserved)
4599 btrfs_qgroup_free(root, qgroup_reserved);
4418} 4600}
4419 4601
4420/** 4602/**
@@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4522 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; 4704 enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4523 int ret = 0; 4705 int ret = 0;
4524 bool delalloc_lock = true; 4706 bool delalloc_lock = true;
4707 u64 to_free = 0;
4708 unsigned dropped;
4525 4709
4526 /* If we are a free space inode we need to not flush since we will be in 4710 /* If we are a free space inode we need to not flush since we will be in
4527 * the middle of a transaction commit. We also don't need the delalloc 4711 * the middle of a transaction commit. We also don't need the delalloc
@@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4565 csum_bytes = BTRFS_I(inode)->csum_bytes; 4749 csum_bytes = BTRFS_I(inode)->csum_bytes;
4566 spin_unlock(&BTRFS_I(inode)->lock); 4750 spin_unlock(&BTRFS_I(inode)->lock);
4567 4751
4568 if (root->fs_info->quota_enabled) 4752 if (root->fs_info->quota_enabled) {
4569 ret = btrfs_qgroup_reserve(root, num_bytes + 4753 ret = btrfs_qgroup_reserve(root, num_bytes +
4570 nr_extents * root->leafsize); 4754 nr_extents * root->leafsize);
4755 if (ret)
4756 goto out_fail;
4757 }
4571 4758
4572 /* 4759 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4573 * ret != 0 here means the qgroup reservation failed, we go straight to 4760 if (unlikely(ret)) {
4574 * the shared error handling then. 4761 if (root->fs_info->quota_enabled)
4575 */
4576 if (ret == 0)
4577 ret = reserve_metadata_bytes(root, block_rsv,
4578 to_reserve, flush);
4579
4580 if (ret) {
4581 u64 to_free = 0;
4582 unsigned dropped;
4583
4584 spin_lock(&BTRFS_I(inode)->lock);
4585 dropped = drop_outstanding_extent(inode);
4586 /*
4587 * If the inodes csum_bytes is the same as the original
4588 * csum_bytes then we know we haven't raced with any free()ers
4589 * so we can just reduce our inodes csum bytes and carry on.
4590 * Otherwise we have to do the normal free thing to account for
4591 * the case that the free side didn't free up its reserve
4592 * because of this outstanding reservation.
4593 */
4594 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4595 calc_csum_metadata_size(inode, num_bytes, 0);
4596 else
4597 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4598 spin_unlock(&BTRFS_I(inode)->lock);
4599 if (dropped)
4600 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4601
4602 if (to_free) {
4603 btrfs_block_rsv_release(root, block_rsv, to_free);
4604 trace_btrfs_space_reservation(root->fs_info,
4605 "delalloc",
4606 btrfs_ino(inode),
4607 to_free, 0);
4608 }
4609 if (root->fs_info->quota_enabled) {
4610 btrfs_qgroup_free(root, num_bytes + 4762 btrfs_qgroup_free(root, num_bytes +
4611 nr_extents * root->leafsize); 4763 nr_extents * root->leafsize);
4612 } 4764 goto out_fail;
4613 if (delalloc_lock)
4614 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4615 return ret;
4616 } 4765 }
4617 4766
4618 spin_lock(&BTRFS_I(inode)->lock); 4767 spin_lock(&BTRFS_I(inode)->lock);
@@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4633 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4782 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4634 4783
4635 return 0; 4784 return 0;
4785
4786out_fail:
4787 spin_lock(&BTRFS_I(inode)->lock);
4788 dropped = drop_outstanding_extent(inode);
4789 /*
4790 * If the inodes csum_bytes is the same as the original
4791 * csum_bytes then we know we haven't raced with any free()ers
4792 * so we can just reduce our inodes csum bytes and carry on.
4793 * Otherwise we have to do the normal free thing to account for
4794 * the case that the free side didn't free up its reserve
4795 * because of this outstanding reservation.
4796 */
4797 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4798 calc_csum_metadata_size(inode, num_bytes, 0);
4799 else
4800 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4801 spin_unlock(&BTRFS_I(inode)->lock);
4802 if (dropped)
4803 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4804
4805 if (to_free) {
4806 btrfs_block_rsv_release(root, block_rsv, to_free);
4807 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4808 btrfs_ino(inode), to_free, 0);
4809 }
4810 if (delalloc_lock)
4811 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4812 return ret;
4636} 4813}
4637 4814
4638/** 4815/**
@@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4654 spin_lock(&BTRFS_I(inode)->lock); 4831 spin_lock(&BTRFS_I(inode)->lock);
4655 dropped = drop_outstanding_extent(inode); 4832 dropped = drop_outstanding_extent(inode);
4656 4833
4657 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 4834 if (num_bytes)
4835 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4658 spin_unlock(&BTRFS_I(inode)->lock); 4836 spin_unlock(&BTRFS_I(inode)->lock);
4659 if (dropped > 0) 4837 if (dropped > 0)
4660 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4838 to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4721 btrfs_free_reserved_data_space(inode, num_bytes); 4899 btrfs_free_reserved_data_space(inode, num_bytes);
4722} 4900}
4723 4901
4724static int update_block_group(struct btrfs_trans_handle *trans, 4902static int update_block_group(struct btrfs_root *root,
4725 struct btrfs_root *root,
4726 u64 bytenr, u64 num_bytes, int alloc) 4903 u64 bytenr, u64 num_bytes, int alloc)
4727{ 4904{
4728 struct btrfs_block_group_cache *cache = NULL; 4905 struct btrfs_block_group_cache *cache = NULL;
@@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4759 * space back to the block group, otherwise we will leak space. 4936 * space back to the block group, otherwise we will leak space.
4760 */ 4937 */
4761 if (!alloc && cache->cached == BTRFS_CACHE_NO) 4938 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4762 cache_block_group(cache, trans, NULL, 1); 4939 cache_block_group(cache, 1);
4763 4940
4764 byte_in_group = bytenr - cache->key.objectid; 4941 byte_in_group = bytenr - cache->key.objectid;
4765 WARN_ON(byte_in_group > cache->key.offset); 4942 WARN_ON(byte_in_group > cache->key.offset);
@@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4809 struct btrfs_block_group_cache *cache; 4986 struct btrfs_block_group_cache *cache;
4810 u64 bytenr; 4987 u64 bytenr;
4811 4988
4989 spin_lock(&root->fs_info->block_group_cache_lock);
4990 bytenr = root->fs_info->first_logical_byte;
4991 spin_unlock(&root->fs_info->block_group_cache_lock);
4992
4993 if (bytenr < (u64)-1)
4994 return bytenr;
4995
4812 cache = btrfs_lookup_first_block_group(root->fs_info, search_start); 4996 cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4813 if (!cache) 4997 if (!cache)
4814 return 0; 4998 return 0;
@@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root,
4859/* 5043/*
4860 * this function must be called within transaction 5044 * this function must be called within transaction
4861 */ 5045 */
4862int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, 5046int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4863 struct btrfs_root *root,
4864 u64 bytenr, u64 num_bytes) 5047 u64 bytenr, u64 num_bytes)
4865{ 5048{
4866 struct btrfs_block_group_cache *cache; 5049 struct btrfs_block_group_cache *cache;
@@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4874 * to one because the slow code to read in the free extents does check 5057 * to one because the slow code to read in the free extents does check
4875 * the pinned extents. 5058 * the pinned extents.
4876 */ 5059 */
4877 cache_block_group(cache, trans, root, 1); 5060 cache_block_group(cache, 1);
4878 5061
4879 pin_down_extent(root, cache, bytenr, num_bytes, 0); 5062 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4880 5063
@@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5271 } 5454 }
5272 } 5455 }
5273 5456
5274 ret = update_block_group(trans, root, bytenr, num_bytes, 0); 5457 ret = update_block_group(root, bytenr, num_bytes, 0);
5275 if (ret) { 5458 if (ret) {
5276 btrfs_abort_transaction(trans, extent_root, ret); 5459 btrfs_abort_transaction(trans, extent_root, ret);
5277 goto out; 5460 goto out;
@@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5316 if (head->extent_op) { 5499 if (head->extent_op) {
5317 if (!head->must_insert_reserved) 5500 if (!head->must_insert_reserved)
5318 goto out; 5501 goto out;
5319 kfree(head->extent_op); 5502 btrfs_free_delayed_extent_op(head->extent_op);
5320 head->extent_op = NULL; 5503 head->extent_op = NULL;
5321 } 5504 }
5322 5505
@@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5439 return ret; 5622 return ret;
5440} 5623}
5441 5624
5442static u64 stripe_align(struct btrfs_root *root, u64 val) 5625static u64 stripe_align(struct btrfs_root *root,
5626 struct btrfs_block_group_cache *cache,
5627 u64 val, u64 num_bytes)
5443{ 5628{
5444 u64 mask = ((u64)root->stripesize - 1); 5629 u64 ret = ALIGN(val, root->stripesize);
5445 u64 ret = (val + mask) & ~mask;
5446 return ret; 5630 return ret;
5447} 5631}
5448 5632
@@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5462 u64 num_bytes) 5646 u64 num_bytes)
5463{ 5647{
5464 struct btrfs_caching_control *caching_ctl; 5648 struct btrfs_caching_control *caching_ctl;
5465 DEFINE_WAIT(wait);
5466 5649
5467 caching_ctl = get_caching_control(cache); 5650 caching_ctl = get_caching_control(cache);
5468 if (!caching_ctl) 5651 if (!caching_ctl)
@@ -5479,7 +5662,6 @@ static noinline int
5479wait_block_group_cache_done(struct btrfs_block_group_cache *cache) 5662wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5480{ 5663{
5481 struct btrfs_caching_control *caching_ctl; 5664 struct btrfs_caching_control *caching_ctl;
5482 DEFINE_WAIT(wait);
5483 5665
5484 caching_ctl = get_caching_control(cache); 5666 caching_ctl = get_caching_control(cache);
5485 if (!caching_ctl) 5667 if (!caching_ctl)
@@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5493 5675
5494int __get_raid_index(u64 flags) 5676int __get_raid_index(u64 flags)
5495{ 5677{
5496 int index;
5497
5498 if (flags & BTRFS_BLOCK_GROUP_RAID10) 5678 if (flags & BTRFS_BLOCK_GROUP_RAID10)
5499 index = 0; 5679 return BTRFS_RAID_RAID10;
5500 else if (flags & BTRFS_BLOCK_GROUP_RAID1) 5680 else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5501 index = 1; 5681 return BTRFS_RAID_RAID1;
5502 else if (flags & BTRFS_BLOCK_GROUP_DUP) 5682 else if (flags & BTRFS_BLOCK_GROUP_DUP)
5503 index = 2; 5683 return BTRFS_RAID_DUP;
5504 else if (flags & BTRFS_BLOCK_GROUP_RAID0) 5684 else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5505 index = 3; 5685 return BTRFS_RAID_RAID0;
5506 else 5686 else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5507 index = 4; 5687 return BTRFS_RAID_RAID5;
5688 else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5689 return BTRFS_RAID_RAID6;
5508 5690
5509 return index; 5691 return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5510} 5692}
5511 5693
5512static int get_block_group_index(struct btrfs_block_group_cache *cache) 5694static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5649,6 +5831,8 @@ search:
5649 if (!block_group_bits(block_group, data)) { 5831 if (!block_group_bits(block_group, data)) {
5650 u64 extra = BTRFS_BLOCK_GROUP_DUP | 5832 u64 extra = BTRFS_BLOCK_GROUP_DUP |
5651 BTRFS_BLOCK_GROUP_RAID1 | 5833 BTRFS_BLOCK_GROUP_RAID1 |
5834 BTRFS_BLOCK_GROUP_RAID5 |
5835 BTRFS_BLOCK_GROUP_RAID6 |
5652 BTRFS_BLOCK_GROUP_RAID10; 5836 BTRFS_BLOCK_GROUP_RAID10;
5653 5837
5654 /* 5838 /*
@@ -5664,8 +5848,7 @@ have_block_group:
5664 cached = block_group_cache_done(block_group); 5848 cached = block_group_cache_done(block_group);
5665 if (unlikely(!cached)) { 5849 if (unlikely(!cached)) {
5666 found_uncached_bg = true; 5850 found_uncached_bg = true;
5667 ret = cache_block_group(block_group, trans, 5851 ret = cache_block_group(block_group, 0);
5668 orig_root, 0);
5669 BUG_ON(ret < 0); 5852 BUG_ON(ret < 0);
5670 ret = 0; 5853 ret = 0;
5671 } 5854 }
@@ -5678,6 +5861,7 @@ have_block_group:
5678 * lets look there 5861 * lets look there
5679 */ 5862 */
5680 if (last_ptr) { 5863 if (last_ptr) {
5864 unsigned long aligned_cluster;
5681 /* 5865 /*
5682 * the refill lock keeps out other 5866 * the refill lock keeps out other
5683 * people trying to start a new cluster 5867 * people trying to start a new cluster
@@ -5744,11 +5928,15 @@ refill_cluster:
5744 goto unclustered_alloc; 5928 goto unclustered_alloc;
5745 } 5929 }
5746 5930
5931 aligned_cluster = max_t(unsigned long,
5932 empty_cluster + empty_size,
5933 block_group->full_stripe_len);
5934
5747 /* allocate a cluster in this block group */ 5935 /* allocate a cluster in this block group */
5748 ret = btrfs_find_space_cluster(trans, root, 5936 ret = btrfs_find_space_cluster(trans, root,
5749 block_group, last_ptr, 5937 block_group, last_ptr,
5750 search_start, num_bytes, 5938 search_start, num_bytes,
5751 empty_cluster + empty_size); 5939 aligned_cluster);
5752 if (ret == 0) { 5940 if (ret == 0) {
5753 /* 5941 /*
5754 * now pull our allocation out of this 5942 * now pull our allocation out of this
@@ -5819,7 +6007,8 @@ unclustered_alloc:
5819 goto loop; 6007 goto loop;
5820 } 6008 }
5821checks: 6009checks:
5822 search_start = stripe_align(root, offset); 6010 search_start = stripe_align(root, used_block_group,
6011 offset, num_bytes);
5823 6012
5824 /* move on to the next group */ 6013 /* move on to the next group */
5825 if (search_start + num_bytes > 6014 if (search_start + num_bytes >
@@ -5970,7 +6159,7 @@ again:
5970 if (ret == -ENOSPC) { 6159 if (ret == -ENOSPC) {
5971 if (!final_tried) { 6160 if (!final_tried) {
5972 num_bytes = num_bytes >> 1; 6161 num_bytes = num_bytes >> 1;
5973 num_bytes = num_bytes & ~(root->sectorsize - 1); 6162 num_bytes = round_down(num_bytes, root->sectorsize);
5974 num_bytes = max(num_bytes, min_alloc_size); 6163 num_bytes = max(num_bytes, min_alloc_size);
5975 if (num_bytes == min_alloc_size) 6164 if (num_bytes == min_alloc_size)
5976 final_tried = true; 6165 final_tried = true;
@@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6094 btrfs_mark_buffer_dirty(path->nodes[0]); 6283 btrfs_mark_buffer_dirty(path->nodes[0]);
6095 btrfs_free_path(path); 6284 btrfs_free_path(path);
6096 6285
6097 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6286 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6098 if (ret) { /* -ENOENT, logic error */ 6287 if (ret) { /* -ENOENT, logic error */
6099 printk(KERN_ERR "btrfs update block group failed for %llu " 6288 printk(KERN_ERR "btrfs update block group failed for %llu "
6100 "%llu\n", (unsigned long long)ins->objectid, 6289 "%llu\n", (unsigned long long)ins->objectid,
@@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6158 btrfs_mark_buffer_dirty(leaf); 6347 btrfs_mark_buffer_dirty(leaf);
6159 btrfs_free_path(path); 6348 btrfs_free_path(path);
6160 6349
6161 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); 6350 ret = update_block_group(root, ins->objectid, ins->offset, 1);
6162 if (ret) { /* -ENOENT, logic error */ 6351 if (ret) { /* -ENOENT, logic error */
6163 printk(KERN_ERR "btrfs update block group failed for %llu " 6352 printk(KERN_ERR "btrfs update block group failed for %llu "
6164 "%llu\n", (unsigned long long)ins->objectid, 6353 "%llu\n", (unsigned long long)ins->objectid,
@@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6201 u64 num_bytes = ins->offset; 6390 u64 num_bytes = ins->offset;
6202 6391
6203 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 6392 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6204 cache_block_group(block_group, trans, NULL, 0); 6393 cache_block_group(block_group, 0);
6205 caching_ctl = get_caching_control(block_group); 6394 caching_ctl = get_caching_control(block_group);
6206 6395
6207 if (!caching_ctl) { 6396 if (!caching_ctl) {
@@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
6315 if (!ret) 6504 if (!ret)
6316 return block_rsv; 6505 return block_rsv;
6317 if (ret && !block_rsv->failfast) { 6506 if (ret && !block_rsv->failfast) {
6318 static DEFINE_RATELIMIT_STATE(_rs, 6507 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6319 DEFAULT_RATELIMIT_INTERVAL, 6508 static DEFINE_RATELIMIT_STATE(_rs,
6320 /*DEFAULT_RATELIMIT_BURST*/ 2); 6509 DEFAULT_RATELIMIT_INTERVAL * 10,
6321 if (__ratelimit(&_rs)) 6510 /*DEFAULT_RATELIMIT_BURST*/ 1);
6322 WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", 6511 if (__ratelimit(&_rs))
6323 ret); 6512 WARN(1, KERN_DEBUG
6513 "btrfs: block rsv returned %d\n", ret);
6514 }
6324 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 6515 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6325 BTRFS_RESERVE_NO_FLUSH); 6516 BTRFS_RESERVE_NO_FLUSH);
6326 if (!ret) { 6517 if (!ret) {
@@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6386 6577
6387 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { 6578 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6388 struct btrfs_delayed_extent_op *extent_op; 6579 struct btrfs_delayed_extent_op *extent_op;
6389 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); 6580 extent_op = btrfs_alloc_delayed_extent_op();
6390 BUG_ON(!extent_op); /* -ENOMEM */ 6581 BUG_ON(!extent_op); /* -ENOMEM */
6391 if (key) 6582 if (key)
6392 memcpy(&extent_op->key, key, sizeof(extent_op->key)); 6583 memcpy(&extent_op->key, key, sizeof(extent_op->key));
@@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7189 root->fs_info->fs_devices->missing_devices; 7380 root->fs_info->fs_devices->missing_devices;
7190 7381
7191 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7382 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7383 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7192 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7384 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7193 7385
7194 if (num_devices == 1) { 7386 if (num_devices == 1) {
@@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7467 index = get_block_group_index(block_group); 7659 index = get_block_group_index(block_group);
7468 } 7660 }
7469 7661
7470 if (index == 0) { 7662 if (index == BTRFS_RAID_RAID10) {
7471 dev_min = 4; 7663 dev_min = 4;
7472 /* Divide by 2 */ 7664 /* Divide by 2 */
7473 min_free >>= 1; 7665 min_free >>= 1;
7474 } else if (index == 1) { 7666 } else if (index == BTRFS_RAID_RAID1) {
7475 dev_min = 2; 7667 dev_min = 2;
7476 } else if (index == 2) { 7668 } else if (index == BTRFS_RAID_DUP) {
7477 /* Multiply by 2 */ 7669 /* Multiply by 2 */
7478 min_free <<= 1; 7670 min_free <<= 1;
7479 } else if (index == 3) { 7671 } else if (index == BTRFS_RAID_RAID0) {
7480 dev_min = fs_devices->rw_devices; 7672 dev_min = fs_devices->rw_devices;
7481 do_div(min_free, dev_min); 7673 do_div(min_free, dev_min);
7482 } 7674 }
@@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7637 space_info = list_entry(info->space_info.next, 7829 space_info = list_entry(info->space_info.next,
7638 struct btrfs_space_info, 7830 struct btrfs_space_info,
7639 list); 7831 list);
7640 if (space_info->bytes_pinned > 0 || 7832 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
7641 space_info->bytes_reserved > 0 || 7833 if (space_info->bytes_pinned > 0 ||
7642 space_info->bytes_may_use > 0) { 7834 space_info->bytes_reserved > 0 ||
7643 WARN_ON(1); 7835 space_info->bytes_may_use > 0) {
7644 dump_space_info(space_info, 0, 0); 7836 WARN_ON(1);
7837 dump_space_info(space_info, 0, 0);
7838 }
7645 } 7839 }
7646 list_del(&space_info->list); 7840 list_del(&space_info->list);
7647 kfree(space_info); 7841 kfree(space_info);
@@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7740 btrfs_release_path(path); 7934 btrfs_release_path(path);
7741 cache->flags = btrfs_block_group_flags(&cache->item); 7935 cache->flags = btrfs_block_group_flags(&cache->item);
7742 cache->sectorsize = root->sectorsize; 7936 cache->sectorsize = root->sectorsize;
7743 7937 cache->full_stripe_len = btrfs_full_stripe_len(root,
7938 &root->fs_info->mapping_tree,
7939 found_key.objectid);
7744 btrfs_init_free_space_ctl(cache); 7940 btrfs_init_free_space_ctl(cache);
7745 7941
7746 /* 7942 /*
@@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7794 if (!(get_alloc_profile(root, space_info->flags) & 7990 if (!(get_alloc_profile(root, space_info->flags) &
7795 (BTRFS_BLOCK_GROUP_RAID10 | 7991 (BTRFS_BLOCK_GROUP_RAID10 |
7796 BTRFS_BLOCK_GROUP_RAID1 | 7992 BTRFS_BLOCK_GROUP_RAID1 |
7993 BTRFS_BLOCK_GROUP_RAID5 |
7994 BTRFS_BLOCK_GROUP_RAID6 |
7797 BTRFS_BLOCK_GROUP_DUP))) 7995 BTRFS_BLOCK_GROUP_DUP)))
7798 continue; 7996 continue;
7799 /* 7997 /*
@@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7869 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8067 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7870 cache->sectorsize = root->sectorsize; 8068 cache->sectorsize = root->sectorsize;
7871 cache->fs_info = root->fs_info; 8069 cache->fs_info = root->fs_info;
8070 cache->full_stripe_len = btrfs_full_stripe_len(root,
8071 &root->fs_info->mapping_tree,
8072 chunk_offset);
7872 8073
7873 atomic_set(&cache->count, 1); 8074 atomic_set(&cache->count, 1);
7874 spin_lock_init(&cache->lock); 8075 spin_lock_init(&cache->lock);
@@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7918 u64 extra_flags = chunk_to_extended(flags) & 8119 u64 extra_flags = chunk_to_extended(flags) &
7919 BTRFS_EXTENDED_PROFILE_MASK; 8120 BTRFS_EXTENDED_PROFILE_MASK;
7920 8121
8122 write_seqlock(&fs_info->profiles_lock);
7921 if (flags & BTRFS_BLOCK_GROUP_DATA) 8123 if (flags & BTRFS_BLOCK_GROUP_DATA)
7922 fs_info->avail_data_alloc_bits &= ~extra_flags; 8124 fs_info->avail_data_alloc_bits &= ~extra_flags;
7923 if (flags & BTRFS_BLOCK_GROUP_METADATA) 8125 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7924 fs_info->avail_metadata_alloc_bits &= ~extra_flags; 8126 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7925 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 8127 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7926 fs_info->avail_system_alloc_bits &= ~extra_flags; 8128 fs_info->avail_system_alloc_bits &= ~extra_flags;
8129 write_sequnlock(&fs_info->profiles_lock);
7927} 8130}
7928 8131
7929int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 8132int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
@@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8022 spin_lock(&root->fs_info->block_group_cache_lock); 8225 spin_lock(&root->fs_info->block_group_cache_lock);
8023 rb_erase(&block_group->cache_node, 8226 rb_erase(&block_group->cache_node,
8024 &root->fs_info->block_group_cache_tree); 8227 &root->fs_info->block_group_cache_tree);
8228
8229 if (root->fs_info->first_logical_byte == block_group->key.objectid)
8230 root->fs_info->first_logical_byte = (u64)-1;
8025 spin_unlock(&root->fs_info->block_group_cache_lock); 8231 spin_unlock(&root->fs_info->block_group_cache_lock);
8026 8232
8027 down_write(&block_group->space_info->groups_sem); 8233 down_write(&block_group->space_info->groups_sem);
@@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8144 8350
8145 if (end - start >= range->minlen) { 8351 if (end - start >= range->minlen) {
8146 if (!block_group_cache_done(cache)) { 8352 if (!block_group_cache_done(cache)) {
8147 ret = cache_block_group(cache, NULL, root, 0); 8353 ret = cache_block_group(cache, 0);
8148 if (!ret) 8354 if (!ret)
8149 wait_block_group_cache_done(cache); 8355 wait_block_group_cache_done(cache);
8150 } 8356 }