author     Josef Bacik <jbacik@fb.com>    2014-01-23 09:21:38 -0500
committer  Chris Mason <clm@fb.com>       2014-01-28 16:20:25 -0500
commit     d7df2c796d7eedd72a334dc89c65e1fec8171431
tree       63e3adda6e56db27b13d0df28840a873ceac5855
parent     5039eddc19aee8c894191c24f2dde4e645ca1bbb
Btrfs: attach delayed ref updates to delayed ref heads
Currently we have two rb-trees, one for delayed ref heads and one for all of the
delayed refs, including the delayed ref heads.  When we process the delayed refs
we have to hold onto the delayed ref lock for all of the selecting and merging
and such, which results in quite a bit of lock contention.  This was solved by
having a waitqueue and only one flusher at a time, however this hurts if we get
a lot of delayed refs queued up.

So instead just have an rb tree for the delayed ref heads, and then attach the
delayed ref updates to an rb tree that is per delayed ref head.  Then we only
need to take the delayed ref lock when adding new delayed refs and when
selecting a delayed ref head to process, all the rest of the time we deal with a
per delayed ref head lock which will be much less contentious.

The locking rules for this get a little more complicated since we have to lock
up to 3 things to properly process delayed refs, but I will address that problem
later.  For now this passes all of xfstests and my overnight stress tests.
Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Chris Mason <clm@fb.com>
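[Editor's note] For readers skimming the patch, the sketch below summarizes the net effect on the two core structures.  It is condensed from the fs/btrfs/delayed-ref.h hunks in this diff: only the fields the patch adds, removes, or relies on are shown, and the pre-existing fields (the embedded ref node, extent_op, the head counters, and so on) are omitted, so treat it as an illustration rather than the full definitions.

/*
 * Condensed, illustrative view of the structures after this patch.
 * Not the complete definitions; untouched fields are omitted.
 */
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/atomic.h>

struct btrfs_delayed_ref_head {
	struct mutex mutex;

	/* NEW: protects ref_root and the ref mods hanging off this head */
	spinlock_t lock;
	/* NEW: per-head rb-tree of delayed ref updates for this extent,
	 * replacing the single global tree of all delayed refs */
	struct rb_root ref_root;

	/* linkage into the tree of heads, delayed_refs->href_root */
	struct rb_node href_node;

	unsigned int must_insert_reserved:1;
	unsigned int is_data:1;
	/* NEW: set while one thread is running this head's refs,
	 * replacing the old per-transaction cluster list */
	unsigned int processing:1;
};

struct btrfs_delayed_ref_root {
	/* heads only now; individual refs live under each head's ref_root
	 * (the old "struct rb_root root" of all refs is gone) */
	struct rb_root href_root;

	/* NEW: atomic so it can be updated without delayed_refs->lock */
	atomic_t num_entries;
};

With this split, delayed_refs->lock is only needed to insert refs and to pick a head in btrfs_select_ref_head(); per-head work runs under head->lock instead.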
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/backref.c      |  23
-rw-r--r--  fs/btrfs/delayed-ref.c  | 223
-rw-r--r--  fs/btrfs/delayed-ref.h  |  23
-rw-r--r--  fs/btrfs/disk-io.c      |  79
-rw-r--r--  fs/btrfs/extent-tree.c  | 317
-rw-r--r--  fs/btrfs/transaction.c  |   7
6 files changed, 267 insertions(+), 405 deletions(-)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 835b6c9a26a8..34a8952de8dd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -538,14 +538,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
538 if (extent_op && extent_op->update_key) 538 if (extent_op && extent_op->update_key)
539 btrfs_disk_key_to_cpu(&op_key, &extent_op->key); 539 btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
540 540
541 while ((n = rb_prev(n))) { 541 spin_lock(&head->lock);
542 n = rb_first(&head->ref_root);
543 while (n) {
542 struct btrfs_delayed_ref_node *node; 544 struct btrfs_delayed_ref_node *node;
543 node = rb_entry(n, struct btrfs_delayed_ref_node, 545 node = rb_entry(n, struct btrfs_delayed_ref_node,
544 rb_node); 546 rb_node);
545 if (node->bytenr != head->node.bytenr) 547 n = rb_next(n);
546 break;
547 WARN_ON(node->is_head);
548
549 if (node->seq > seq) 548 if (node->seq > seq)
550 continue; 549 continue;
551 550
@@ -612,10 +611,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
612 WARN_ON(1); 611 WARN_ON(1);
613 } 612 }
614 if (ret) 613 if (ret)
615 return ret; 614 break;
616 } 615 }
617 616 spin_unlock(&head->lock);
618 return 0; 617 return ret;
619} 618}
620 619
621/* 620/*
@@ -882,15 +881,15 @@ again:
882 btrfs_put_delayed_ref(&head->node); 881 btrfs_put_delayed_ref(&head->node);
883 goto again; 882 goto again;
884 } 883 }
884 spin_unlock(&delayed_refs->lock);
885 ret = __add_delayed_refs(head, time_seq, 885 ret = __add_delayed_refs(head, time_seq,
886 &prefs_delayed); 886 &prefs_delayed);
887 mutex_unlock(&head->mutex); 887 mutex_unlock(&head->mutex);
888 if (ret) { 888 if (ret)
889 spin_unlock(&delayed_refs->lock);
890 goto out; 889 goto out;
891 } 890 } else {
891 spin_unlock(&delayed_refs->lock);
892 } 892 }
893 spin_unlock(&delayed_refs->lock);
894 } 893 }
895 894
896 if (path->slots[0]) { 895 if (path->slots[0]) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index fab60c1ba065..f3bff89eecf0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -269,39 +269,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
269 269
270static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, 270static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
271 struct btrfs_delayed_ref_root *delayed_refs, 271 struct btrfs_delayed_ref_root *delayed_refs,
272 struct btrfs_delayed_ref_head *head,
272 struct btrfs_delayed_ref_node *ref) 273 struct btrfs_delayed_ref_node *ref)
273{ 274{
274 rb_erase(&ref->rb_node, &delayed_refs->root);
275 if (btrfs_delayed_ref_is_head(ref)) { 275 if (btrfs_delayed_ref_is_head(ref)) {
276 struct btrfs_delayed_ref_head *head;
277
278 head = btrfs_delayed_node_to_head(ref); 276 head = btrfs_delayed_node_to_head(ref);
279 rb_erase(&head->href_node, &delayed_refs->href_root); 277 rb_erase(&head->href_node, &delayed_refs->href_root);
278 } else {
279 assert_spin_locked(&head->lock);
280 rb_erase(&ref->rb_node, &head->ref_root);
280 } 281 }
281 ref->in_tree = 0; 282 ref->in_tree = 0;
282 btrfs_put_delayed_ref(ref); 283 btrfs_put_delayed_ref(ref);
283 delayed_refs->num_entries--; 284 atomic_dec(&delayed_refs->num_entries);
284 if (trans->delayed_ref_updates) 285 if (trans->delayed_ref_updates)
285 trans->delayed_ref_updates--; 286 trans->delayed_ref_updates--;
286} 287}
287 288
288static int merge_ref(struct btrfs_trans_handle *trans, 289static int merge_ref(struct btrfs_trans_handle *trans,
289 struct btrfs_delayed_ref_root *delayed_refs, 290 struct btrfs_delayed_ref_root *delayed_refs,
291 struct btrfs_delayed_ref_head *head,
290 struct btrfs_delayed_ref_node *ref, u64 seq) 292 struct btrfs_delayed_ref_node *ref, u64 seq)
291{ 293{
292 struct rb_node *node; 294 struct rb_node *node;
293 int merged = 0;
294 int mod = 0; 295 int mod = 0;
295 int done = 0; 296 int done = 0;
296 297
297 node = rb_prev(&ref->rb_node); 298 node = rb_next(&ref->rb_node);
298 while (node) { 299 while (!done && node) {
299 struct btrfs_delayed_ref_node *next; 300 struct btrfs_delayed_ref_node *next;
300 301
301 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 302 next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
302 node = rb_prev(node); 303 node = rb_next(node);
303 if (next->bytenr != ref->bytenr)
304 break;
305 if (seq && next->seq >= seq) 304 if (seq && next->seq >= seq)
306 break; 305 break;
307 if (comp_entry(ref, next, 0)) 306 if (comp_entry(ref, next, 0))
@@ -321,12 +320,11 @@ static int merge_ref(struct btrfs_trans_handle *trans,
321 mod = -next->ref_mod; 320 mod = -next->ref_mod;
322 } 321 }
323 322
324 merged++; 323 drop_delayed_ref(trans, delayed_refs, head, next);
325 drop_delayed_ref(trans, delayed_refs, next);
326 ref->ref_mod += mod; 324 ref->ref_mod += mod;
327 if (ref->ref_mod == 0) { 325 if (ref->ref_mod == 0) {
328 drop_delayed_ref(trans, delayed_refs, ref); 326 drop_delayed_ref(trans, delayed_refs, head, ref);
329 break; 327 done = 1;
330 } else { 328 } else {
331 /* 329 /*
332 * You can't have multiples of the same ref on a tree 330 * You can't have multiples of the same ref on a tree
@@ -335,13 +333,8 @@ static int merge_ref(struct btrfs_trans_handle *trans,
335 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || 333 WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
336 ref->type == BTRFS_SHARED_BLOCK_REF_KEY); 334 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
337 } 335 }
338
339 if (done)
340 break;
341 node = rb_prev(&ref->rb_node);
342 } 336 }
343 337 return done;
344 return merged;
345} 338}
346 339
347void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, 340void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -352,6 +345,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
352 struct rb_node *node; 345 struct rb_node *node;
353 u64 seq = 0; 346 u64 seq = 0;
354 347
348 assert_spin_locked(&head->lock);
355 /* 349 /*
356 * We don't have too much refs to merge in the case of delayed data 350 * We don't have too much refs to merge in the case of delayed data
357 * refs. 351 * refs.
@@ -369,22 +363,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
369 } 363 }
370 spin_unlock(&fs_info->tree_mod_seq_lock); 364 spin_unlock(&fs_info->tree_mod_seq_lock);
371 365
372 node = rb_prev(&head->node.rb_node); 366 node = rb_first(&head->ref_root);
373 while (node) { 367 while (node) {
374 struct btrfs_delayed_ref_node *ref; 368 struct btrfs_delayed_ref_node *ref;
375 369
376 ref = rb_entry(node, struct btrfs_delayed_ref_node, 370 ref = rb_entry(node, struct btrfs_delayed_ref_node,
377 rb_node); 371 rb_node);
378 if (ref->bytenr != head->node.bytenr)
379 break;
380
381 /* We can't merge refs that are outside of our seq count */ 372 /* We can't merge refs that are outside of our seq count */
382 if (seq && ref->seq >= seq) 373 if (seq && ref->seq >= seq)
383 break; 374 break;
384 if (merge_ref(trans, delayed_refs, ref, seq)) 375 if (merge_ref(trans, delayed_refs, head, ref, seq))
385 node = rb_prev(&head->node.rb_node); 376 node = rb_first(&head->ref_root);
386 else 377 else
387 node = rb_prev(node); 378 node = rb_next(&ref->rb_node);
388 } 379 }
389} 380}
390 381
@@ -412,64 +403,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
412 return ret; 403 return ret;
413} 404}
414 405
415int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 406struct btrfs_delayed_ref_head *
416 struct list_head *cluster, u64 start) 407btrfs_select_ref_head(struct btrfs_trans_handle *trans)
417{ 408{
418 int count = 0;
419 struct btrfs_delayed_ref_root *delayed_refs; 409 struct btrfs_delayed_ref_root *delayed_refs;
420 struct rb_node *node; 410 struct btrfs_delayed_ref_head *head;
421 struct btrfs_delayed_ref_head *head = NULL; 411 u64 start;
412 bool loop = false;
422 413
423 delayed_refs = &trans->transaction->delayed_refs; 414 delayed_refs = &trans->transaction->delayed_refs;
424 node = rb_first(&delayed_refs->href_root);
425 415
426 if (start) {
427 find_ref_head(&delayed_refs->href_root, start + 1, &head, 1);
428 if (head)
429 node = &head->href_node;
430 }
431again: 416again:
432 while (node && count < 32) { 417 start = delayed_refs->run_delayed_start;
433 head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); 418 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
434 if (list_empty(&head->cluster)) { 419 if (!head && !loop) {
435 list_add_tail(&head->cluster, cluster); 420 delayed_refs->run_delayed_start = 0;
436 delayed_refs->run_delayed_start =
437 head->node.bytenr;
438 count++;
439
440 WARN_ON(delayed_refs->num_heads_ready == 0);
441 delayed_refs->num_heads_ready--;
442 } else if (count) {
443 /* the goal of the clustering is to find extents
444 * that are likely to end up in the same extent
445 * leaf on disk. So, we don't want them spread
446 * all over the tree. Stop now if we've hit
447 * a head that was already in use
448 */
449 break;
450 }
451 node = rb_next(node);
452 }
453 if (count) {
454 return 0;
455 } else if (start) {
456 /*
457 * we've gone to the end of the rbtree without finding any
458 * clusters. start from the beginning and try again
459 */
460 start = 0; 421 start = 0;
461 node = rb_first(&delayed_refs->href_root); 422 loop = true;
462 goto again; 423 head = find_ref_head(&delayed_refs->href_root, start, NULL, 1);
424 if (!head)
425 return NULL;
426 } else if (!head && loop) {
427 return NULL;
463 } 428 }
464 return 1;
465}
466 429
467void btrfs_release_ref_cluster(struct list_head *cluster) 430 while (head->processing) {
468{ 431 struct rb_node *node;
469 struct list_head *pos, *q; 432
433 node = rb_next(&head->href_node);
434 if (!node) {
435 if (loop)
436 return NULL;
437 delayed_refs->run_delayed_start = 0;
438 start = 0;
439 loop = true;
440 goto again;
441 }
442 head = rb_entry(node, struct btrfs_delayed_ref_head,
443 href_node);
444 }
470 445
471 list_for_each_safe(pos, q, cluster) 446 head->processing = 1;
472 list_del_init(pos); 447 WARN_ON(delayed_refs->num_heads_ready == 0);
448 delayed_refs->num_heads_ready--;
449 delayed_refs->run_delayed_start = head->node.bytenr +
450 head->node.num_bytes;
451 return head;
473} 452}
474 453
475/* 454/*
@@ -483,6 +462,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster)
483static noinline void 462static noinline void
484update_existing_ref(struct btrfs_trans_handle *trans, 463update_existing_ref(struct btrfs_trans_handle *trans,
485 struct btrfs_delayed_ref_root *delayed_refs, 464 struct btrfs_delayed_ref_root *delayed_refs,
465 struct btrfs_delayed_ref_head *head,
486 struct btrfs_delayed_ref_node *existing, 466 struct btrfs_delayed_ref_node *existing,
487 struct btrfs_delayed_ref_node *update) 467 struct btrfs_delayed_ref_node *update)
488{ 468{
@@ -495,7 +475,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
495 */ 475 */
496 existing->ref_mod--; 476 existing->ref_mod--;
497 if (existing->ref_mod == 0) 477 if (existing->ref_mod == 0)
498 drop_delayed_ref(trans, delayed_refs, existing); 478 drop_delayed_ref(trans, delayed_refs, head, existing);
499 else 479 else
500 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || 480 WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
501 existing->type == BTRFS_SHARED_BLOCK_REF_KEY); 481 existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -565,9 +545,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
565 } 545 }
566 } 546 }
567 /* 547 /*
568 * update the reference mod on the head to reflect this new operation 548 * update the reference mod on the head to reflect this new operation,
549 * only need the lock for this case cause we could be processing it
550 * currently, for refs we just added we know we're a-ok.
569 */ 551 */
552 spin_lock(&existing_ref->lock);
570 existing->ref_mod += update->ref_mod; 553 existing->ref_mod += update->ref_mod;
554 spin_unlock(&existing_ref->lock);
571} 555}
572 556
573/* 557/*
@@ -575,13 +559,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
575 * this does all the dirty work in terms of maintaining the correct 559 * this does all the dirty work in terms of maintaining the correct
576 * overall modification count. 560 * overall modification count.
577 */ 561 */
578static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info, 562static noinline struct btrfs_delayed_ref_head *
579 struct btrfs_trans_handle *trans, 563add_delayed_ref_head(struct btrfs_fs_info *fs_info,
580 struct btrfs_delayed_ref_node *ref, 564 struct btrfs_trans_handle *trans,
581 u64 bytenr, u64 num_bytes, 565 struct btrfs_delayed_ref_node *ref, u64 bytenr,
582 int action, int is_data) 566 u64 num_bytes, int action, int is_data)
583{ 567{
584 struct btrfs_delayed_ref_node *existing; 568 struct btrfs_delayed_ref_head *existing;
585 struct btrfs_delayed_ref_head *head_ref = NULL; 569 struct btrfs_delayed_ref_head *head_ref = NULL;
586 struct btrfs_delayed_ref_root *delayed_refs; 570 struct btrfs_delayed_ref_root *delayed_refs;
587 int count_mod = 1; 571 int count_mod = 1;
@@ -628,39 +612,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
628 head_ref = btrfs_delayed_node_to_head(ref); 612 head_ref = btrfs_delayed_node_to_head(ref);
629 head_ref->must_insert_reserved = must_insert_reserved; 613 head_ref->must_insert_reserved = must_insert_reserved;
630 head_ref->is_data = is_data; 614 head_ref->is_data = is_data;
615 head_ref->ref_root = RB_ROOT;
616 head_ref->processing = 0;
631 617
632 INIT_LIST_HEAD(&head_ref->cluster); 618 spin_lock_init(&head_ref->lock);
633 mutex_init(&head_ref->mutex); 619 mutex_init(&head_ref->mutex);
634 620
635 trace_add_delayed_ref_head(ref, head_ref, action); 621 trace_add_delayed_ref_head(ref, head_ref, action);
636 622
637 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 623 existing = htree_insert(&delayed_refs->href_root,
638 624 &head_ref->href_node);
639 if (existing) { 625 if (existing) {
640 update_existing_head_ref(existing, ref); 626 update_existing_head_ref(&existing->node, ref);
641 /* 627 /*
642 * we've updated the existing ref, free the newly 628 * we've updated the existing ref, free the newly
643 * allocated ref 629 * allocated ref
644 */ 630 */
645 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); 631 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
632 head_ref = existing;
646 } else { 633 } else {
647 htree_insert(&delayed_refs->href_root, &head_ref->href_node);
648 delayed_refs->num_heads++; 634 delayed_refs->num_heads++;
649 delayed_refs->num_heads_ready++; 635 delayed_refs->num_heads_ready++;
650 delayed_refs->num_entries++; 636 atomic_inc(&delayed_refs->num_entries);
651 trans->delayed_ref_updates++; 637 trans->delayed_ref_updates++;
652 } 638 }
639 return head_ref;
653} 640}
654 641
655/* 642/*
656 * helper to insert a delayed tree ref into the rbtree. 643 * helper to insert a delayed tree ref into the rbtree.
657 */ 644 */
658static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info, 645static noinline void
659 struct btrfs_trans_handle *trans, 646add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
660 struct btrfs_delayed_ref_node *ref, 647 struct btrfs_trans_handle *trans,
661 u64 bytenr, u64 num_bytes, u64 parent, 648 struct btrfs_delayed_ref_head *head_ref,
662 u64 ref_root, int level, int action, 649 struct btrfs_delayed_ref_node *ref, u64 bytenr,
663 int for_cow) 650 u64 num_bytes, u64 parent, u64 ref_root, int level,
651 int action, int for_cow)
664{ 652{
665 struct btrfs_delayed_ref_node *existing; 653 struct btrfs_delayed_ref_node *existing;
666 struct btrfs_delayed_tree_ref *full_ref; 654 struct btrfs_delayed_tree_ref *full_ref;
@@ -696,30 +684,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
696 684
697 trace_add_delayed_tree_ref(ref, full_ref, action); 685 trace_add_delayed_tree_ref(ref, full_ref, action);
698 686
699 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 687 spin_lock(&head_ref->lock);
700 688 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
701 if (existing) { 689 if (existing) {
702 update_existing_ref(trans, delayed_refs, existing, ref); 690 update_existing_ref(trans, delayed_refs, head_ref, existing,
691 ref);
703 /* 692 /*
704 * we've updated the existing ref, free the newly 693 * we've updated the existing ref, free the newly
705 * allocated ref 694 * allocated ref
706 */ 695 */
707 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); 696 kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
708 } else { 697 } else {
709 delayed_refs->num_entries++; 698 atomic_inc(&delayed_refs->num_entries);
710 trans->delayed_ref_updates++; 699 trans->delayed_ref_updates++;
711 } 700 }
701 spin_unlock(&head_ref->lock);
712} 702}
713 703
714/* 704/*
715 * helper to insert a delayed data ref into the rbtree. 705 * helper to insert a delayed data ref into the rbtree.
716 */ 706 */
717static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info, 707static noinline void
718 struct btrfs_trans_handle *trans, 708add_delayed_data_ref(struct btrfs_fs_info *fs_info,
719 struct btrfs_delayed_ref_node *ref, 709 struct btrfs_trans_handle *trans,
720 u64 bytenr, u64 num_bytes, u64 parent, 710 struct btrfs_delayed_ref_head *head_ref,
721 u64 ref_root, u64 owner, u64 offset, 711 struct btrfs_delayed_ref_node *ref, u64 bytenr,
722 int action, int for_cow) 712 u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
713 u64 offset, int action, int for_cow)
723{ 714{
724 struct btrfs_delayed_ref_node *existing; 715 struct btrfs_delayed_ref_node *existing;
725 struct btrfs_delayed_data_ref *full_ref; 716 struct btrfs_delayed_data_ref *full_ref;
@@ -757,19 +748,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
757 748
758 trace_add_delayed_data_ref(ref, full_ref, action); 749 trace_add_delayed_data_ref(ref, full_ref, action);
759 750
760 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 751 spin_lock(&head_ref->lock);
761 752 existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
762 if (existing) { 753 if (existing) {
763 update_existing_ref(trans, delayed_refs, existing, ref); 754 update_existing_ref(trans, delayed_refs, head_ref, existing,
755 ref);
764 /* 756 /*
765 * we've updated the existing ref, free the newly 757 * we've updated the existing ref, free the newly
766 * allocated ref 758 * allocated ref
767 */ 759 */
768 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); 760 kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
769 } else { 761 } else {
770 delayed_refs->num_entries++; 762 atomic_inc(&delayed_refs->num_entries);
771 trans->delayed_ref_updates++; 763 trans->delayed_ref_updates++;
772 } 764 }
765 spin_unlock(&head_ref->lock);
773} 766}
774 767
775/* 768/*
@@ -808,10 +801,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
808 * insert both the head node and the new ref without dropping 801 * insert both the head node and the new ref without dropping
809 * the spin lock 802 * the spin lock
810 */ 803 */
811 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 804 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
812 num_bytes, action, 0); 805 bytenr, num_bytes, action, 0);
813 806
814 add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr, 807 add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
815 num_bytes, parent, ref_root, level, action, 808 num_bytes, parent, ref_root, level, action,
816 for_cow); 809 for_cow);
817 spin_unlock(&delayed_refs->lock); 810 spin_unlock(&delayed_refs->lock);
@@ -856,10 +849,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
856 * insert both the head node and the new ref without dropping 849 * insert both the head node and the new ref without dropping
857 * the spin lock 850 * the spin lock
858 */ 851 */
859 add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, 852 head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
860 num_bytes, action, 1); 853 bytenr, num_bytes, action, 1);
861 854
862 add_delayed_data_ref(fs_info, trans, &ref->node, bytenr, 855 add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
863 num_bytes, parent, ref_root, owner, offset, 856 num_bytes, parent, ref_root, owner, offset,
864 action, for_cow); 857 action, for_cow);
865 spin_unlock(&delayed_refs->lock); 858 spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a54c9d47918f..4ba9b93022ff 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -81,7 +81,8 @@ struct btrfs_delayed_ref_head {
81 */ 81 */
82 struct mutex mutex; 82 struct mutex mutex;
83 83
84 struct list_head cluster; 84 spinlock_t lock;
85 struct rb_root ref_root;
85 86
86 struct rb_node href_node; 87 struct rb_node href_node;
87 88
@@ -100,6 +101,7 @@ struct btrfs_delayed_ref_head {
100 */ 101 */
101 unsigned int must_insert_reserved:1; 102 unsigned int must_insert_reserved:1;
102 unsigned int is_data:1; 103 unsigned int is_data:1;
104 unsigned int processing:1;
103}; 105};
104 106
105struct btrfs_delayed_tree_ref { 107struct btrfs_delayed_tree_ref {
@@ -118,8 +120,6 @@ struct btrfs_delayed_data_ref {
118}; 120};
119 121
120struct btrfs_delayed_ref_root { 122struct btrfs_delayed_ref_root {
121 struct rb_root root;
122
123 /* head ref rbtree */ 123 /* head ref rbtree */
124 struct rb_root href_root; 124 struct rb_root href_root;
125 125
@@ -129,7 +129,7 @@ struct btrfs_delayed_ref_root {
129 /* how many delayed ref updates we've queued, used by the 129 /* how many delayed ref updates we've queued, used by the
130 * throttling code 130 * throttling code
131 */ 131 */
132 unsigned long num_entries; 132 atomic_t num_entries;
133 133
134 /* total number of head nodes in tree */ 134 /* total number of head nodes in tree */
135 unsigned long num_heads; 135 unsigned long num_heads;
@@ -138,15 +138,6 @@ struct btrfs_delayed_ref_root {
138 unsigned long num_heads_ready; 138 unsigned long num_heads_ready;
139 139
140 /* 140 /*
141 * bumped when someone is making progress on the delayed
142 * refs, so that other procs know they are just adding to
143 * contention intead of helping
144 */
145 atomic_t procs_running_refs;
146 atomic_t ref_seq;
147 wait_queue_head_t wait;
148
149 /*
150 * set when the tree is flushing before a transaction commit, 141 * set when the tree is flushing before a transaction commit,
151 * used by the throttling code to decide if new updates need 142 * used by the throttling code to decide if new updates need
152 * to be run right away 143 * to be run right away
@@ -231,9 +222,9 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
231 mutex_unlock(&head->mutex); 222 mutex_unlock(&head->mutex);
232} 223}
233 224
234int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 225
235 struct list_head *cluster, u64 search_start); 226struct btrfs_delayed_ref_head *
236void btrfs_release_ref_cluster(struct list_head *cluster); 227btrfs_select_ref_head(struct btrfs_trans_handle *trans);
237 228
238int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, 229int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
239 struct btrfs_delayed_ref_root *delayed_refs, 230 struct btrfs_delayed_ref_root *delayed_refs,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9850a51a3f19..ed23127a4b02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3801,58 +3801,55 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
3801 delayed_refs = &trans->delayed_refs; 3801 delayed_refs = &trans->delayed_refs;
3802 3802
3803 spin_lock(&delayed_refs->lock); 3803 spin_lock(&delayed_refs->lock);
3804 if (delayed_refs->num_entries == 0) { 3804 if (atomic_read(&delayed_refs->num_entries) == 0) {
3805 spin_unlock(&delayed_refs->lock); 3805 spin_unlock(&delayed_refs->lock);
3806 btrfs_info(root->fs_info, "delayed_refs has NO entry"); 3806 btrfs_info(root->fs_info, "delayed_refs has NO entry");
3807 return ret; 3807 return ret;
3808 } 3808 }
3809 3809
3810 while ((node = rb_first(&delayed_refs->root)) != NULL) { 3810 while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
3811 struct btrfs_delayed_ref_head *head = NULL; 3811 struct btrfs_delayed_ref_head *head;
3812 bool pin_bytes = false; 3812 bool pin_bytes = false;
3813 3813
3814 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 3814 head = rb_entry(node, struct btrfs_delayed_ref_head,
3815 atomic_set(&ref->refs, 1); 3815 href_node);
3816 if (btrfs_delayed_ref_is_head(ref)) { 3816 if (!mutex_trylock(&head->mutex)) {
3817 atomic_inc(&head->node.refs);
3818 spin_unlock(&delayed_refs->lock);
3817 3819
3818 head = btrfs_delayed_node_to_head(ref); 3820 mutex_lock(&head->mutex);
3819 if (!mutex_trylock(&head->mutex)) {
3820 atomic_inc(&ref->refs);
3821 spin_unlock(&delayed_refs->lock);
3822
3823 /* Need to wait for the delayed ref to run */
3824 mutex_lock(&head->mutex);
3825 mutex_unlock(&head->mutex);
3826 btrfs_put_delayed_ref(ref);
3827
3828 spin_lock(&delayed_refs->lock);
3829 continue;
3830 }
3831
3832 if (head->must_insert_reserved)
3833 pin_bytes = true;
3834 btrfs_free_delayed_extent_op(head->extent_op);
3835 delayed_refs->num_heads--;
3836 if (list_empty(&head->cluster))
3837 delayed_refs->num_heads_ready--;
3838 list_del_init(&head->cluster);
3839 }
3840
3841 ref->in_tree = 0;
3842 rb_erase(&ref->rb_node, &delayed_refs->root);
3843 if (head)
3844 rb_erase(&head->href_node, &delayed_refs->href_root);
3845
3846 delayed_refs->num_entries--;
3847 spin_unlock(&delayed_refs->lock);
3848 if (head) {
3849 if (pin_bytes)
3850 btrfs_pin_extent(root, ref->bytenr,
3851 ref->num_bytes, 1);
3852 mutex_unlock(&head->mutex); 3821 mutex_unlock(&head->mutex);
3822 btrfs_put_delayed_ref(&head->node);
3823 spin_lock(&delayed_refs->lock);
3824 continue;
3825 }
3826 spin_lock(&head->lock);
3827 while ((node = rb_first(&head->ref_root)) != NULL) {
3828 ref = rb_entry(node, struct btrfs_delayed_ref_node,
3829 rb_node);
3830 ref->in_tree = 0;
3831 rb_erase(&ref->rb_node, &head->ref_root);
3832 atomic_dec(&delayed_refs->num_entries);
3833 btrfs_put_delayed_ref(ref);
3834 cond_resched_lock(&head->lock);
3853 } 3835 }
3854 btrfs_put_delayed_ref(ref); 3836 if (head->must_insert_reserved)
3837 pin_bytes = true;
3838 btrfs_free_delayed_extent_op(head->extent_op);
3839 delayed_refs->num_heads--;
3840 if (head->processing == 0)
3841 delayed_refs->num_heads_ready--;
3842 atomic_dec(&delayed_refs->num_entries);
3843 head->node.in_tree = 0;
3844 rb_erase(&head->href_node, &delayed_refs->href_root);
3845 spin_unlock(&head->lock);
3846 spin_unlock(&delayed_refs->lock);
3847 mutex_unlock(&head->mutex);
3855 3848
3849 if (pin_bytes)
3850 btrfs_pin_extent(root, head->node.bytenr,
3851 head->node.num_bytes, 1);
3852 btrfs_put_delayed_ref(&head->node);
3856 cond_resched(); 3853 cond_resched();
3857 spin_lock(&delayed_refs->lock); 3854 spin_lock(&delayed_refs->lock);
3858 } 3855 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 77acc0873895..c77156c77de7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -857,12 +857,14 @@ again:
857 btrfs_put_delayed_ref(&head->node); 857 btrfs_put_delayed_ref(&head->node);
858 goto search_again; 858 goto search_again;
859 } 859 }
860 spin_lock(&head->lock);
860 if (head->extent_op && head->extent_op->update_flags) 861 if (head->extent_op && head->extent_op->update_flags)
861 extent_flags |= head->extent_op->flags_to_set; 862 extent_flags |= head->extent_op->flags_to_set;
862 else 863 else
863 BUG_ON(num_refs == 0); 864 BUG_ON(num_refs == 0);
864 865
865 num_refs += head->node.ref_mod; 866 num_refs += head->node.ref_mod;
867 spin_unlock(&head->lock);
866 mutex_unlock(&head->mutex); 868 mutex_unlock(&head->mutex);
867 } 869 }
868 spin_unlock(&delayed_refs->lock); 870 spin_unlock(&delayed_refs->lock);
@@ -2287,40 +2289,33 @@ static noinline struct btrfs_delayed_ref_node *
2287select_delayed_ref(struct btrfs_delayed_ref_head *head) 2289select_delayed_ref(struct btrfs_delayed_ref_head *head)
2288{ 2290{
2289 struct rb_node *node; 2291 struct rb_node *node;
2290 struct btrfs_delayed_ref_node *ref; 2292 struct btrfs_delayed_ref_node *ref, *last = NULL;;
2291 int action = BTRFS_ADD_DELAYED_REF; 2293
2292again:
2293 /* 2294 /*
2294 * select delayed ref of type BTRFS_ADD_DELAYED_REF first. 2295 * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2295 * this prevents ref count from going down to zero when 2296 * this prevents ref count from going down to zero when
2296 * there still are pending delayed ref. 2297 * there still are pending delayed ref.
2297 */ 2298 */
2298 node = rb_prev(&head->node.rb_node); 2299 node = rb_first(&head->ref_root);
2299 while (1) { 2300 while (node) {
2300 if (!node)
2301 break;
2302 ref = rb_entry(node, struct btrfs_delayed_ref_node, 2301 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2303 rb_node); 2302 rb_node);
2304 if (ref->bytenr != head->node.bytenr) 2303 if (ref->action == BTRFS_ADD_DELAYED_REF)
2305 break;
2306 if (ref->action == action)
2307 return ref; 2304 return ref;
2308 node = rb_prev(node); 2305 else if (last == NULL)
2309 } 2306 last = ref;
2310 if (action == BTRFS_ADD_DELAYED_REF) { 2307 node = rb_next(node);
2311 action = BTRFS_DROP_DELAYED_REF;
2312 goto again;
2313 } 2308 }
2314 return NULL; 2309 return last;
2315} 2310}
2316 2311
2317/* 2312/*
2318 * Returns 0 on success or if called with an already aborted transaction. 2313 * Returns 0 on success or if called with an already aborted transaction.
2319 * Returns -ENOMEM or -EIO on failure and will abort the transaction. 2314 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2320 */ 2315 */
2321static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, 2316static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2322 struct btrfs_root *root, 2317 struct btrfs_root *root,
2323 struct list_head *cluster) 2318 unsigned long nr)
2324{ 2319{
2325 struct btrfs_delayed_ref_root *delayed_refs; 2320 struct btrfs_delayed_ref_root *delayed_refs;
2326 struct btrfs_delayed_ref_node *ref; 2321 struct btrfs_delayed_ref_node *ref;
@@ -2328,23 +2323,26 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2328 struct btrfs_delayed_extent_op *extent_op; 2323 struct btrfs_delayed_extent_op *extent_op;
2329 struct btrfs_fs_info *fs_info = root->fs_info; 2324 struct btrfs_fs_info *fs_info = root->fs_info;
2330 int ret; 2325 int ret;
2331 int count = 0; 2326 unsigned long count = 0;
2332 int must_insert_reserved = 0; 2327 int must_insert_reserved = 0;
2333 2328
2334 delayed_refs = &trans->transaction->delayed_refs; 2329 delayed_refs = &trans->transaction->delayed_refs;
2335 while (1) { 2330 while (1) {
2336 if (!locked_ref) { 2331 if (!locked_ref) {
2337 /* pick a new head ref from the cluster list */ 2332 if (count >= nr)
2338 if (list_empty(cluster))
2339 break; 2333 break;
2340 2334
2341 locked_ref = list_entry(cluster->next, 2335 spin_lock(&delayed_refs->lock);
2342 struct btrfs_delayed_ref_head, cluster); 2336 locked_ref = btrfs_select_ref_head(trans);
2337 if (!locked_ref) {
2338 spin_unlock(&delayed_refs->lock);
2339 break;
2340 }
2343 2341
2344 /* grab the lock that says we are going to process 2342 /* grab the lock that says we are going to process
2345 * all the refs for this head */ 2343 * all the refs for this head */
2346 ret = btrfs_delayed_ref_lock(trans, locked_ref); 2344 ret = btrfs_delayed_ref_lock(trans, locked_ref);
2347 2345 spin_unlock(&delayed_refs->lock);
2348 /* 2346 /*
2349 * we may have dropped the spin lock to get the head 2347 * we may have dropped the spin lock to get the head
2350 * mutex lock, and that might have given someone else 2348 * mutex lock, and that might have given someone else
@@ -2365,6 +2363,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2365 * finish. If we merged anything we need to re-loop so we can 2363 * finish. If we merged anything we need to re-loop so we can
2366 * get a good ref. 2364 * get a good ref.
2367 */ 2365 */
2366 spin_lock(&locked_ref->lock);
2368 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, 2367 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2369 locked_ref); 2368 locked_ref);
2370 2369
@@ -2376,17 +2375,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2376 2375
2377 if (ref && ref->seq && 2376 if (ref && ref->seq &&
2378 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { 2377 btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2379 /* 2378 spin_unlock(&locked_ref->lock);
2380 * there are still refs with lower seq numbers in the
2381 * process of being added. Don't run this ref yet.
2382 */
2383 list_del_init(&locked_ref->cluster);
2384 btrfs_delayed_ref_unlock(locked_ref); 2379 btrfs_delayed_ref_unlock(locked_ref);
2385 locked_ref = NULL; 2380 spin_lock(&delayed_refs->lock);
2381 locked_ref->processing = 0;
2386 delayed_refs->num_heads_ready++; 2382 delayed_refs->num_heads_ready++;
2387 spin_unlock(&delayed_refs->lock); 2383 spin_unlock(&delayed_refs->lock);
2384 locked_ref = NULL;
2388 cond_resched(); 2385 cond_resched();
2389 spin_lock(&delayed_refs->lock);
2390 continue; 2386 continue;
2391 } 2387 }
2392 2388
@@ -2401,6 +2397,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2401 locked_ref->extent_op = NULL; 2397 locked_ref->extent_op = NULL;
2402 2398
2403 if (!ref) { 2399 if (!ref) {
2400
2401
2404 /* All delayed refs have been processed, Go ahead 2402 /* All delayed refs have been processed, Go ahead
2405 * and send the head node to run_one_delayed_ref, 2403 * and send the head node to run_one_delayed_ref,
2406 * so that any accounting fixes can happen 2404 * so that any accounting fixes can happen
@@ -2413,8 +2411,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2413 } 2411 }
2414 2412
2415 if (extent_op) { 2413 if (extent_op) {
2416 spin_unlock(&delayed_refs->lock); 2414 spin_unlock(&locked_ref->lock);
2417
2418 ret = run_delayed_extent_op(trans, root, 2415 ret = run_delayed_extent_op(trans, root,
2419 ref, extent_op); 2416 ref, extent_op);
2420 btrfs_free_delayed_extent_op(extent_op); 2417 btrfs_free_delayed_extent_op(extent_op);
@@ -2428,23 +2425,38 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2428 */ 2425 */
2429 if (must_insert_reserved) 2426 if (must_insert_reserved)
2430 locked_ref->must_insert_reserved = 1; 2427 locked_ref->must_insert_reserved = 1;
2428 locked_ref->processing = 0;
2431 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); 2429 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2432 spin_lock(&delayed_refs->lock);
2433 btrfs_delayed_ref_unlock(locked_ref); 2430 btrfs_delayed_ref_unlock(locked_ref);
2434 return ret; 2431 return ret;
2435 } 2432 }
2436 2433 continue;
2437 goto next;
2438 } 2434 }
2439 }
2440 2435
2441 ref->in_tree = 0; 2436 /*
2442 rb_erase(&ref->rb_node, &delayed_refs->root); 2437 * Need to drop our head ref lock and re-aqcuire the
2443 if (btrfs_delayed_ref_is_head(ref)) { 2438 * delayed ref lock and then re-check to make sure
2439 * nobody got added.
2440 */
2441 spin_unlock(&locked_ref->lock);
2442 spin_lock(&delayed_refs->lock);
2443 spin_lock(&locked_ref->lock);
2444 if (rb_first(&locked_ref->ref_root)) {
2445 spin_unlock(&locked_ref->lock);
2446 spin_unlock(&delayed_refs->lock);
2447 continue;
2448 }
2449 ref->in_tree = 0;
2450 delayed_refs->num_heads--;
2444 rb_erase(&locked_ref->href_node, 2451 rb_erase(&locked_ref->href_node,
2445 &delayed_refs->href_root); 2452 &delayed_refs->href_root);
2453 spin_unlock(&delayed_refs->lock);
2454 } else {
2455 ref->in_tree = 0;
2456 rb_erase(&ref->rb_node, &locked_ref->ref_root);
2446 } 2457 }
2447 delayed_refs->num_entries--; 2458 atomic_dec(&delayed_refs->num_entries);
2459
2448 if (!btrfs_delayed_ref_is_head(ref)) { 2460 if (!btrfs_delayed_ref_is_head(ref)) {
2449 /* 2461 /*
2450 * when we play the delayed ref, also correct the 2462 * when we play the delayed ref, also correct the
@@ -2461,20 +2473,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2461 default: 2473 default:
2462 WARN_ON(1); 2474 WARN_ON(1);
2463 } 2475 }
2464 } else {
2465 list_del_init(&locked_ref->cluster);
2466 } 2476 }
2467 spin_unlock(&delayed_refs->lock); 2477 spin_unlock(&locked_ref->lock);
2468 2478
2469 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2479 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2470 must_insert_reserved); 2480 must_insert_reserved);
2471 2481
2472 btrfs_free_delayed_extent_op(extent_op); 2482 btrfs_free_delayed_extent_op(extent_op);
2473 if (ret) { 2483 if (ret) {
2484 locked_ref->processing = 0;
2474 btrfs_delayed_ref_unlock(locked_ref); 2485 btrfs_delayed_ref_unlock(locked_ref);
2475 btrfs_put_delayed_ref(ref); 2486 btrfs_put_delayed_ref(ref);
2476 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); 2487 btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2477 spin_lock(&delayed_refs->lock);
2478 return ret; 2488 return ret;
2479 } 2489 }
2480 2490
@@ -2490,11 +2500,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2490 } 2500 }
2491 btrfs_put_delayed_ref(ref); 2501 btrfs_put_delayed_ref(ref);
2492 count++; 2502 count++;
2493next:
2494 cond_resched(); 2503 cond_resched();
2495 spin_lock(&delayed_refs->lock);
2496 } 2504 }
2497 return count; 2505 return 0;
2498} 2506}
2499 2507
2500#ifdef SCRAMBLE_DELAYED_REFS 2508#ifdef SCRAMBLE_DELAYED_REFS
@@ -2576,16 +2584,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2576 return ret; 2584 return ret;
2577} 2585}
2578 2586
2579static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2580 int count)
2581{
2582 int val = atomic_read(&delayed_refs->ref_seq);
2583
2584 if (val < seq || val >= seq + count)
2585 return 1;
2586 return 0;
2587}
2588
2589static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) 2587static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2590{ 2588{
2591 u64 num_bytes; 2589 u64 num_bytes;
@@ -2647,12 +2645,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2647 struct rb_node *node; 2645 struct rb_node *node;
2648 struct btrfs_delayed_ref_root *delayed_refs; 2646 struct btrfs_delayed_ref_root *delayed_refs;
2649 struct btrfs_delayed_ref_head *head; 2647 struct btrfs_delayed_ref_head *head;
2650 struct list_head cluster;
2651 int ret; 2648 int ret;
2652 u64 delayed_start;
2653 int run_all = count == (unsigned long)-1; 2649 int run_all = count == (unsigned long)-1;
2654 int run_most = 0; 2650 int run_most = 0;
2655 int loops;
2656 2651
2657 /* We'll clean this up in btrfs_cleanup_transaction */ 2652 /* We'll clean this up in btrfs_cleanup_transaction */
2658 if (trans->aborted) 2653 if (trans->aborted)
@@ -2664,121 +2659,31 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2664 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); 2659 btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2665 2660
2666 delayed_refs = &trans->transaction->delayed_refs; 2661 delayed_refs = &trans->transaction->delayed_refs;
2667 INIT_LIST_HEAD(&cluster);
2668 if (count == 0) { 2662 if (count == 0) {
2669 count = delayed_refs->num_entries * 2; 2663 count = atomic_read(&delayed_refs->num_entries) * 2;
2670 run_most = 1; 2664 run_most = 1;
2671 } 2665 }
2672 2666
2673 if (!run_all && !run_most) {
2674 int old;
2675 int seq = atomic_read(&delayed_refs->ref_seq);
2676
2677progress:
2678 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2679 if (old) {
2680 DEFINE_WAIT(__wait);
2681 if (delayed_refs->flushing ||
2682 !btrfs_should_throttle_delayed_refs(trans, root))
2683 return 0;
2684
2685 prepare_to_wait(&delayed_refs->wait, &__wait,
2686 TASK_UNINTERRUPTIBLE);
2687
2688 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2689 if (old) {
2690 schedule();
2691 finish_wait(&delayed_refs->wait, &__wait);
2692
2693 if (!refs_newer(delayed_refs, seq, 256))
2694 goto progress;
2695 else
2696 return 0;
2697 } else {
2698 finish_wait(&delayed_refs->wait, &__wait);
2699 goto again;
2700 }
2701 }
2702
2703 } else {
2704 atomic_inc(&delayed_refs->procs_running_refs);
2705 }
2706
2707again: 2667again:
2708 loops = 0;
2709 spin_lock(&delayed_refs->lock);
2710
2711#ifdef SCRAMBLE_DELAYED_REFS 2668#ifdef SCRAMBLE_DELAYED_REFS
2712 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); 2669 delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2713#endif 2670#endif
2714 2671 ret = __btrfs_run_delayed_refs(trans, root, count);
2715 while (1) { 2672 if (ret < 0) {
2716 if (!(run_all || run_most) && 2673 btrfs_abort_transaction(trans, root, ret);
2717 !btrfs_should_throttle_delayed_refs(trans, root)) 2674 return ret;
2718 break;
2719
2720 /*
2721 * go find something we can process in the rbtree. We start at
2722 * the beginning of the tree, and then build a cluster
2723 * of refs to process starting at the first one we are able to
2724 * lock
2725 */
2726 delayed_start = delayed_refs->run_delayed_start;
2727 ret = btrfs_find_ref_cluster(trans, &cluster,
2728 delayed_refs->run_delayed_start);
2729 if (ret)
2730 break;
2731
2732 ret = run_clustered_refs(trans, root, &cluster);
2733 if (ret < 0) {
2734 btrfs_release_ref_cluster(&cluster);
2735 spin_unlock(&delayed_refs->lock);
2736 btrfs_abort_transaction(trans, root, ret);
2737 atomic_dec(&delayed_refs->procs_running_refs);
2738 wake_up(&delayed_refs->wait);
2739 return ret;
2740 }
2741
2742 atomic_add(ret, &delayed_refs->ref_seq);
2743
2744 count -= min_t(unsigned long, ret, count);
2745
2746 if (count == 0)
2747 break;
2748
2749 if (delayed_start >= delayed_refs->run_delayed_start) {
2750 if (loops == 0) {
2751 /*
2752 * btrfs_find_ref_cluster looped. let's do one
2753 * more cycle. if we don't run any delayed ref
2754 * during that cycle (because we can't because
2755 * all of them are blocked), bail out.
2756 */
2757 loops = 1;
2758 } else {
2759 /*
2760 * no runnable refs left, stop trying
2761 */
2762 BUG_ON(run_all);
2763 break;
2764 }
2765 }
2766 if (ret) {
2767 /* refs were run, let's reset staleness detection */
2768 loops = 0;
2769 }
2770 } 2675 }
2771 2676
2772 if (run_all) { 2677 if (run_all) {
2773 if (!list_empty(&trans->new_bgs)) { 2678 if (!list_empty(&trans->new_bgs))
2774 spin_unlock(&delayed_refs->lock);
2775 btrfs_create_pending_block_groups(trans, root); 2679 btrfs_create_pending_block_groups(trans, root);
2776 spin_lock(&delayed_refs->lock);
2777 }
2778 2680
2681 spin_lock(&delayed_refs->lock);
2779 node = rb_first(&delayed_refs->href_root); 2682 node = rb_first(&delayed_refs->href_root);
2780 if (!node) 2683 if (!node) {
2684 spin_unlock(&delayed_refs->lock);
2781 goto out; 2685 goto out;
2686 }
2782 count = (unsigned long)-1; 2687 count = (unsigned long)-1;
2783 2688
2784 while (node) { 2689 while (node) {
@@ -2807,16 +2712,10 @@ again:
2807 node = rb_next(node); 2712 node = rb_next(node);
2808 } 2713 }
2809 spin_unlock(&delayed_refs->lock); 2714 spin_unlock(&delayed_refs->lock);
2810 schedule_timeout(1); 2715 cond_resched();
2811 goto again; 2716 goto again;
2812 } 2717 }
2813out: 2718out:
2814 atomic_dec(&delayed_refs->procs_running_refs);
2815 smp_mb();
2816 if (waitqueue_active(&delayed_refs->wait))
2817 wake_up(&delayed_refs->wait);
2818
2819 spin_unlock(&delayed_refs->lock);
2820 assert_qgroups_uptodate(trans); 2719 assert_qgroups_uptodate(trans);
2821 return 0; 2720 return 0;
2822} 2721}
@@ -2858,12 +2757,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2858 struct rb_node *node; 2757 struct rb_node *node;
2859 int ret = 0; 2758 int ret = 0;
2860 2759
2861 ret = -ENOENT;
2862 delayed_refs = &trans->transaction->delayed_refs; 2760 delayed_refs = &trans->transaction->delayed_refs;
2863 spin_lock(&delayed_refs->lock); 2761 spin_lock(&delayed_refs->lock);
2864 head = btrfs_find_delayed_ref_head(trans, bytenr); 2762 head = btrfs_find_delayed_ref_head(trans, bytenr);
2865 if (!head) 2763 if (!head) {
2866 goto out; 2764 spin_unlock(&delayed_refs->lock);
2765 return 0;
2766 }
2867 2767
2868 if (!mutex_trylock(&head->mutex)) { 2768 if (!mutex_trylock(&head->mutex)) {
2869 atomic_inc(&head->node.refs); 2769 atomic_inc(&head->node.refs);
@@ -2880,40 +2780,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2880 btrfs_put_delayed_ref(&head->node); 2780 btrfs_put_delayed_ref(&head->node);
2881 return -EAGAIN; 2781 return -EAGAIN;
2882 } 2782 }
2783 spin_unlock(&delayed_refs->lock);
2883 2784
2884 node = rb_prev(&head->node.rb_node); 2785 spin_lock(&head->lock);
2885 if (!node) 2786 node = rb_first(&head->ref_root);
2886 goto out_unlock; 2787 while (node) {
2887 2788 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2888 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2789 node = rb_next(node);
2889
2890 if (ref->bytenr != bytenr)
2891 goto out_unlock;
2892 2790
2893 ret = 1; 2791 /* If it's a shared ref we know a cross reference exists */
2894 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) 2792 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2895 goto out_unlock; 2793 ret = 1;
2794 break;
2795 }
2896 2796
2897 data_ref = btrfs_delayed_node_to_data_ref(ref); 2797 data_ref = btrfs_delayed_node_to_data_ref(ref);
2898 2798
2899 node = rb_prev(node); 2799 /*
2900 if (node) { 2800 * If our ref doesn't match the one we're currently looking at
2901 int seq = ref->seq; 2801 * then we have a cross reference.
2902 2802 */
2903 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); 2803 if (data_ref->root != root->root_key.objectid ||
2904 if (ref->bytenr == bytenr && ref->seq == seq) 2804 data_ref->objectid != objectid ||
2905 goto out_unlock; 2805 data_ref->offset != offset) {
2806 ret = 1;
2807 break;
2808 }
2906 } 2809 }
2907 2810 spin_unlock(&head->lock);
2908 if (data_ref->root != root->root_key.objectid ||
2909 data_ref->objectid != objectid || data_ref->offset != offset)
2910 goto out_unlock;
2911
2912 ret = 0;
2913out_unlock:
2914 mutex_unlock(&head->mutex); 2811 mutex_unlock(&head->mutex);
2915out:
2916 spin_unlock(&delayed_refs->lock);
2917 return ret; 2812 return ret;
2918} 2813}
2919 2814
@@ -5953,8 +5848,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5953{ 5848{
5954 struct btrfs_delayed_ref_head *head; 5849 struct btrfs_delayed_ref_head *head;
5955 struct btrfs_delayed_ref_root *delayed_refs; 5850 struct btrfs_delayed_ref_root *delayed_refs;
5956 struct btrfs_delayed_ref_node *ref;
5957 struct rb_node *node;
5958 int ret = 0; 5851 int ret = 0;
5959 5852
5960 delayed_refs = &trans->transaction->delayed_refs; 5853 delayed_refs = &trans->transaction->delayed_refs;
@@ -5963,14 +5856,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5963 if (!head) 5856 if (!head)
5964 goto out; 5857 goto out;
5965 5858
5966 node = rb_prev(&head->node.rb_node); 5859 spin_lock(&head->lock);
5967 if (!node) 5860 if (rb_first(&head->ref_root))
5968 goto out;
5969
5970 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5971
5972 /* there are still entries for this ref, we can't drop it */
5973 if (ref->bytenr == bytenr)
5974 goto out; 5861 goto out;
5975 5862
5976 if (head->extent_op) { 5863 if (head->extent_op) {
@@ -5992,20 +5879,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5992 * ahead and process it. 5879 * ahead and process it.
5993 */ 5880 */
5994 head->node.in_tree = 0; 5881 head->node.in_tree = 0;
5995 rb_erase(&head->node.rb_node, &delayed_refs->root);
5996 rb_erase(&head->href_node, &delayed_refs->href_root); 5882 rb_erase(&head->href_node, &delayed_refs->href_root);
5997 5883
5998 delayed_refs->num_entries--; 5884 atomic_dec(&delayed_refs->num_entries);
5999 5885
6000 /* 5886 /*
6001 * we don't take a ref on the node because we're removing it from the 5887 * we don't take a ref on the node because we're removing it from the
6002 * tree, so we just steal the ref the tree was holding. 5888 * tree, so we just steal the ref the tree was holding.
6003 */ 5889 */
6004 delayed_refs->num_heads--; 5890 delayed_refs->num_heads--;
6005 if (list_empty(&head->cluster)) 5891 if (head->processing == 0)
6006 delayed_refs->num_heads_ready--; 5892 delayed_refs->num_heads_ready--;
6007 5893 head->processing = 0;
6008 list_del_init(&head->cluster); 5894 spin_unlock(&head->lock);
6009 spin_unlock(&delayed_refs->lock); 5895 spin_unlock(&delayed_refs->lock);
6010 5896
6011 BUG_ON(head->extent_op); 5897 BUG_ON(head->extent_op);
@@ -6016,6 +5902,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
6016 btrfs_put_delayed_ref(&head->node); 5902 btrfs_put_delayed_ref(&head->node);
6017 return ret; 5903 return ret;
6018out: 5904out:
5905 spin_unlock(&head->lock);
6019 spin_unlock(&delayed_refs->lock); 5906 spin_unlock(&delayed_refs->lock);
6020 return 0; 5907 return 0;
6021} 5908}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b16352ce0f73..fd1446496fe8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -62,7 +62,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
62 WARN_ON(atomic_read(&transaction->use_count) == 0); 62 WARN_ON(atomic_read(&transaction->use_count) == 0);
63 if (atomic_dec_and_test(&transaction->use_count)) { 63 if (atomic_dec_and_test(&transaction->use_count)) {
64 BUG_ON(!list_empty(&transaction->list)); 64 BUG_ON(!list_empty(&transaction->list));
65 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.root));
66 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); 65 WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
67 while (!list_empty(&transaction->pending_chunks)) { 66 while (!list_empty(&transaction->pending_chunks)) {
68 struct extent_map *em; 67 struct extent_map *em;
@@ -184,9 +183,8 @@ loop:
184 atomic_set(&cur_trans->use_count, 2); 183 atomic_set(&cur_trans->use_count, 2);
185 cur_trans->start_time = get_seconds(); 184 cur_trans->start_time = get_seconds();
186 185
187 cur_trans->delayed_refs.root = RB_ROOT;
188 cur_trans->delayed_refs.href_root = RB_ROOT; 186 cur_trans->delayed_refs.href_root = RB_ROOT;
189 cur_trans->delayed_refs.num_entries = 0; 187 atomic_set(&cur_trans->delayed_refs.num_entries, 0);
190 cur_trans->delayed_refs.num_heads_ready = 0; 188 cur_trans->delayed_refs.num_heads_ready = 0;
191 cur_trans->delayed_refs.num_heads = 0; 189 cur_trans->delayed_refs.num_heads = 0;
192 cur_trans->delayed_refs.flushing = 0; 190 cur_trans->delayed_refs.flushing = 0;
@@ -206,9 +204,6 @@ loop:
206 atomic64_set(&fs_info->tree_mod_seq, 0); 204 atomic64_set(&fs_info->tree_mod_seq, 0);
207 205
208 spin_lock_init(&cur_trans->delayed_refs.lock); 206 spin_lock_init(&cur_trans->delayed_refs.lock);
209 atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
210 atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
211 init_waitqueue_head(&cur_trans->delayed_refs.wait);
212 207
213 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 208 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
214 INIT_LIST_HEAD(&cur_trans->ordered_operations); 209 INIT_LIST_HEAD(&cur_trans->ordered_operations);