author     Josef Bacik <jbacik@fb.com>   2014-01-23 10:54:11 -0500
committer  Chris Mason <clm@fb.com>      2014-01-28 16:20:26 -0500
commit     0a2b2a844af616addc87cac3cc18dcaba2a9d0fb (patch)
tree       d81e13b3388df4a66e3a2af6ff2df82f532d5c9e /fs
parent     d7df2c796d7eedd72a334dc89c65e1fec8171431 (diff)
Btrfs: throttle delayed refs better
On one of our gluster clusters we noticed some pretty big lag spikes.
This turned out to be because our transaction commit was taking around
3 minutes to complete.  This is because we have around 30 gigs of
metadata, so our global reserve would end up being the maximum, which
is 512 mb.  So our throttling code would allow a ridiculous amount of
delayed refs to build up, and then they'd all get run at transaction
commit time; for a cold-mounted file system that could take up to 3
minutes to run.

So fix the throttling to be based on both the size of the global
reserve and how long it takes us to run delayed refs.  This patch
tracks the time it takes to run delayed refs and then only allows 1
second's worth of outstanding delayed refs at a time.  This way it
will auto-tune itself from a cold cache up to when everything is in
memory and it no longer has to go to disk.  This makes our transaction
commits take much less time to run.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
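To make the arithmetic concrete, here is a small userspace sketch of the averaging and cutoff math the patch introduces. It is illustration only, not kernel code: update_avg() and should_throttle() are hypothetical stand-ins that mirror the weighted average at the end of __btrfs_run_delayed_refs() and the check in the new btrfs_should_throttle_delayed_refs(), and the NSEC_PER_SEC / 64 seed is the initial value set in open_ctree().

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Fold a measured delayed-ref run's wall-clock time into the running
 * average; the old average is weighted 3:1 against the new sample, so
 * a single outlier run only moves the estimate a quarter of the way. */
static uint64_t update_avg(uint64_t avg, uint64_t runtime_ns)
{
	return (avg * 3 + runtime_ns) / 4;
}

/* Throttle once the backlog, scaled by the average runtime, reaches
 * one second -- the "1 second's worth" bound from the changelog. */
static int should_throttle(uint64_t num_entries, uint64_t avg_runtime)
{
	return num_entries * avg_runtime >= NSEC_PER_SEC;
}

int main(void)
{
	/* Seeded as in open_ctree(): a fresh mount throttles at just
	 * 64 outstanding entries (64 * 15,625,000 ns == 1 s). */
	uint64_t avg = NSEC_PER_SEC / 64;

	printf("cold: throttle at 64 entries? %d\n",
	       should_throttle(64, avg));

	/* A string of fast 100 us runs pulls the average down, so a
	 * much larger backlog is tolerated before we throttle. */
	for (int i = 0; i < 40; i++)
		avg = update_avg(avg, 100000);
	printf("warm: avg=%llu ns, throttle at 64? %d, at 10000? %d\n",
	       (unsigned long long)avg, should_throttle(64, avg),
	       should_throttle(10000, avg));
	return 0;
}

Because the old average is weighted three times the new sample, one slow cold-cache run shifts the throttle point only gradually, and repeated fast warm-cache runs are what eventually relax it: that is the auto-tuning behavior described above.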
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/ctree.h        |  3
-rw-r--r--  fs/btrfs/disk-io.c      |  2
-rw-r--r--  fs/btrfs/extent-tree.c  | 41
-rw-r--r--  fs/btrfs/transaction.c  |  4
4 files changed, 46 insertions, 4 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3cebb4aeddc7..ca6bcc33d033 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1360,6 +1360,7 @@ struct btrfs_fs_info {
 
 	u64 generation;
 	u64 last_trans_committed;
+	u64 avg_delayed_ref_runtime;
 
 	/*
 	 * this is updated to the current trans every time a full commit
@@ -3172,6 +3173,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
 
 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long count);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ed23127a4b02..f0e7bbe14823 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2185,7 +2185,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-
+	fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
 	spin_lock_init(&fs_info->reada_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c77156c77de7..b5322596d60b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
 	struct btrfs_fs_info *fs_info = root->fs_info;
+	ktime_t start = ktime_get();
 	int ret;
 	unsigned long count = 0;
+	unsigned long actual_count = 0;
 	int must_insert_reserved = 0;
 
 	delayed_refs = &trans->transaction->delayed_refs;
@@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				     &delayed_refs->href_root);
			spin_unlock(&delayed_refs->lock);
		} else {
+			actual_count++;
			ref->in_tree = 0;
			rb_erase(&ref->rb_node, &locked_ref->ref_root);
		}
@@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
		count++;
		cond_resched();
	}
+
+	/*
+	 * We don't want to include ref heads since we can have empty ref heads
+	 * and those will drastically skew our runtime down since we just do
+	 * accounting, no actual extent tree updates.
+	 */
+	if (actual_count > 0) {
+		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+		u64 avg;
+
+		/*
+		 * We weigh the current average higher than our current runtime
+		 * to avoid large swings in the average.
+		 */
+		spin_lock(&delayed_refs->lock);
+		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+		avg = div64_u64(avg, 4);
+		fs_info->avg_delayed_ref_runtime = avg;
+		spin_unlock(&delayed_refs->lock);
+	}
	return 0;
 }
 
@@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
 }
 
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
 {
	struct btrfs_block_rsv *global_rsv;
@@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
	return ret;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	u64 num_entries =
+		atomic_read(&trans->transaction->delayed_refs.num_entries);
+	u64 avg_runtime;
+
+	smp_mb();
+	avg_runtime = fs_info->avg_delayed_ref_runtime;
+	if (num_entries * avg_runtime >= NSEC_PER_SEC)
+		return 1;
+
+	return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far. count can be
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index fd1446496fe8..5e2bfdaf8d14 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -645,7 +645,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
 {
	if (root->fs_info->global_block_rsv.space_info->full &&
-	    btrfs_should_throttle_delayed_refs(trans, root))
+	    btrfs_check_space_for_delayed_refs(trans, root))
		return 1;
 
	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -710,7 +710,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
	trans->delayed_ref_updates = 0;
	if (!trans->sync && btrfs_should_throttle_delayed_refs(trans, root)) {
-		cur = max_t(unsigned long, cur, 1);
+		cur = max_t(unsigned long, cur, 32);
		trans->delayed_ref_updates = 0;
		btrfs_run_delayed_refs(trans, root, cur);
	}