commit 97d3ac75e5e0ebf7ca38ae74cebd201c09b97ab2
tree e08af7a4bbb93c22dfeb1bcb2d3caf83aef717c9 /fs/xfs/xfs_alloc.c
parent e26f0501cf743a4289603501413f97ffcb4612f2
author Christoph Hellwig <hch@infradead.org> 2011-04-24 15:06:16 -0400
committer Alex Elder <aelder@sgi.com> 2011-04-28 14:18:04 -0400
xfs: exact busy extent tracking
Update the extent tree in case we have to reuse a busy extent, so that it
always is kept up to date. This is done by replacing the busy list searches
with a new xfs_alloc_busy_reuse helper, which updates the busy extent tree
in case of a reuse. This allows us to reuse metadata extents
unconditionally, and thus avoid log forces especially for allocation btree
blocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
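The heart of the new helper is plain interval arithmetic over the found free
extent [fbno, fend) and a busy extent [bbno, bend). The following is a
minimal userspace sketch of that logic, not kernel code: the struct, the
name busy_update_extent, and the boolean return convention are illustrative
stand-ins, and the rbtree walk, locking, and tracing of the real patch are
deliberately omitted.

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t agblock_t;

/* Illustrative stand-in for the fields of struct xfs_busy_extent. */
struct busy_extent {
	agblock_t bno;		/* start of the busy range */
	agblock_t length;	/* length; 0 marks the entry invalid */
};

/*
 * Sketch of the overlap handling in xfs_alloc_busy_update_extent():
 * returns true if [fbno, fbno + flen) can be reused right away, false
 * if the caller would have to force the log and retry the search.
 */
static bool
busy_update_extent(struct busy_extent *busyp, agblock_t fbno,
		   agblock_t flen, bool userdata)
{
	agblock_t fend = fbno + flen;
	agblock_t bbno = busyp->bno;
	agblock_t bend = bbno + busyp->length;

	/* User data must never observe stale block contents. */
	if (userdata)
		return false;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1: the free extent punches a hole in the middle of
		 * the busy extent. That would split the busy record, which
		 * the immutable transaction/CIL busy list forbids.
		 */
		return false;
	} else if (bbno >= fbno && bend <= fend) {
		/* Cases 2-5: fully covered; invalidate the record. */
		busyp->length = 0;
	} else if (fend < bend) {
		/*
		 * Cases 6-7: overlap at the front; move the start up to
		 * fend (here the length is also shortened so this
		 * standalone interval keeps its end fixed at bend).
		 */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else {
		/* Cases 8-9: overlap at the back; trim the end. */
		busyp->length = fbno - bbno;
	}
	return true;
}

Returning false models the "force the log and retry" path; in the patch
proper that force happens under out_force_log with pag->pagb_lock dropped.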
Diffstat (limited to 'fs/xfs/xfs_alloc.c')
 fs/xfs/xfs_alloc.c | 373 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 218 insertions(+), 155 deletions(-)
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index ff54ef428348..53157d4d5e8b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1396,8 +1396,9 @@ xfs_alloc_ag_vextent_small(
 		if (error)
 			goto error0;
 		if (fbno != NULLAGBLOCK) {
-			if (xfs_alloc_busy_search(args->mp, args->agno, fbno, 1))
-				xfs_trans_set_sync(args->tp);
+			xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
 			if (args->userdata) {
 				xfs_buf_t	*bp;
 
@@ -2475,100 +2476,6 @@ error0:
 	return error;
 }
 
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk. If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * There can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block. The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle. This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again. In
- * this case, the second extent free is what triggers the duplicate and as
- * such the transaction IDs should match. Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous. This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions. In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing. When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree. If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log. Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- *      Thread 1        Thread 2        Thread 3        xfslogd
- *      xact alloc
- *      free X
- *      mark busy
- *      commit xact
- *      free xact
- *                      xact alloc
- *                      alloc X
- *                      busy search
- *                      mark xact sync
- *                      commit xact
- *                      free xact
- *                      force log
- *                      checkpoint starts
- *                      ....
- *                                      xact alloc
- *                                      free X
- *                                      mark busy
- *                                      finds match
- *                                      *** KABOOM! ***
- *                                      ....
- *                                                      log IO completes
- *                                                      unbusy X
- *                                                      checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken. Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transactions, as delayed logging results in
- * the busy extent and transaction lifecycles being different. I.e. the busy
- * extent is active for a lot longer than the transaction. Hence the
- * transaction structure can be freed and reallocated, and then used to mark
- * the same extent busy again in the new transaction. In this case the new
- * transaction will have a different tid but can have the same address, and
- * hence we need to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence. This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint. If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
 void
 xfs_alloc_busy_insert(
 	struct xfs_trans	*tp,
@@ -2580,9 +2487,7 @@ xfs_alloc_busy_insert(
 	struct xfs_busy_extent	*busyp;
 	struct xfs_perag	*pag;
 	struct rb_node		**rbp;
-	struct rb_node		*parent;
-	int			match;
-
+	struct rb_node		*parent = NULL;
 
 	new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
 	if (!new) {
@@ -2591,7 +2496,7 @@ xfs_alloc_busy_insert(
 		 * block, make this a synchronous transaction to insure that
 		 * the block is not reused before this transaction commits.
 		 */
-		trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+		trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
 		xfs_trans_set_sync(tp);
 		return;
 	}
@@ -2599,66 +2504,28 @@ xfs_alloc_busy_insert(
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;
-	new->tid = xfs_log_get_trans_ident(tp);
-
 	INIT_LIST_HEAD(&new->list);
 
 	/* trace before insert to be able to see failed inserts */
-	trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
 
 	pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
 	spin_lock(&pag->pagb_lock);
 	rbp = &pag->pagb_tree.rb_node;
-	parent = NULL;
-	busyp = NULL;
-	match = 0;
-	while (*rbp && match >= 0) {
+	while (*rbp) {
 		parent = *rbp;
 		busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
 
 		if (new->bno < busyp->bno) {
-			/* may overlap, but exact start block is lower */
 			rbp = &(*rbp)->rb_left;
-			if (new->bno + new->length > busyp->bno)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(new->bno + new->length <= busyp->bno);
 		} else if (new->bno > busyp->bno) {
-			/* may overlap, but exact start block is higher */
 			rbp = &(*rbp)->rb_right;
-			if (bno < busyp->bno + busyp->length)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(bno >= busyp->bno + busyp->length);
 		} else {
-			match = busyp->tid == new->tid ? 1 : -1;
-			break;
+			ASSERT(0);
 		}
 	}
-	if (match < 0) {
-		/* overlap marked busy in different transaction */
-		spin_unlock(&pag->pagb_lock);
-		xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
-		goto restart;
-	}
-	if (match > 0) {
-		/*
-		 * overlap marked busy in same transaction. Update if exact
-		 * start block match, otherwise combine the busy extents into
-		 * a single range.
-		 */
-		if (busyp->bno == new->bno) {
-			busyp->length = max(busyp->length, new->length);
-			spin_unlock(&pag->pagb_lock);
-			ASSERT(tp->t_flags & XFS_TRANS_SYNC);
-			xfs_perag_put(pag);
-			kmem_free(new);
-			return;
-		}
-		rb_erase(&busyp->rb_node, &pag->pagb_tree);
-		new->length = max(busyp->bno + busyp->length,
-					new->bno + new->length) -
-			      min(busyp->bno, new->bno);
-		new->bno = min(busyp->bno, new->bno);
-	} else
-		busyp = NULL;
 
 	rb_link_node(&new->rb_node, parent, rbp);
 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2666,7 +2533,6 @@ restart:
 	list_add(&new->list, &tp->t_busy);
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
-	kmem_free(busyp);
 }
 
 /*
@@ -2715,12 +2581,196 @@ xfs_alloc_busy_search(
 		}
 	}
 	spin_unlock(&pag->pagb_lock);
-	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
 	xfs_perag_put(pag);
 	return match;
 }
 
 /*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent. If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation. We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_busy_extent	*busyp,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
+{
+	xfs_agblock_t		fend = fbno + flen;
+	xfs_agblock_t		bbno = busyp->bno;
+	xfs_agblock_t		bend = bbno + busyp->length;
+
+	/*
+	 * If there is a busy extent overlapping a user allocation, we have
+	 * no choice but to force the log and retry the search.
+	 *
+	 * Fortunately this does not happen during normal operation, but
+	 * only if the filesystem is very low on space and has to dip into
+	 * the AGFL for normal allocations.
+	 */
+	if (userdata)
+		goto out_force_log;
+
+	if (bbno < fbno && bend > fend) {
+		/*
+		 * Case 1:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +---------+
+		 *        fbno   fend
+		 */
+
+		/*
+		 * We would have to split the busy extent to be able to track
+		 * it correctly, which we cannot do because we would have to
+		 * modify the list of busy extents attached to the transaction
+		 * or CIL context, which is immutable.
+		 *
+		 * Force out the log to clear the busy extent and retry the
+		 * search.
+		 */
+		goto out_force_log;
+	} else if (bbno >= fbno && bend <= fend) {
+		/*
+		 * Case 2:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------+
+		 *    fbno           fend
+		 *
+		 * Case 3:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 4:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 5:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------------------------+
+		 *    fbno                             fend
+		 *
+		 */
+
+		/*
+		 * The busy extent is fully covered by the extent we are
+		 * allocating, and can simply be removed from the rbtree.
+		 * However we cannot remove it from the immutable list
+		 * tracking busy extents in the transaction or CIL context,
+		 * so set the length to zero to mark it invalid.
+		 *
+		 * We also need to restart the busy extent search from the
+		 * tree root, because erasing the node can rearrange the
+		 * tree topology.
+		 */
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		busyp->length = 0;
+		return false;
+	} else if (fend < bend) {
+		/*
+		 * Case 6:
+		 *              bbno           bend
+		 *              +BBBBBBBBBBBBBBBBB+
+		 *              +---------+
+		 *              fbno   fend
+		 *
+		 * Case 7:
+		 *              bbno           bend
+		 *              +BBBBBBBBBBBBBBBBB+
+		 *    +------------------+
+		 *    fbno            fend
+		 *
+		 */
+		busyp->bno = fend;
+	} else if (bbno < fbno) {
+		/*
+		 * Case 8:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +-------------+
+		 *        fbno       fend
+		 *
+		 * Case 9:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +----------------------+
+		 *        fbno                fend
+		 */
+		busyp->length = fbno - busyp->bno;
+	} else {
+		ASSERT(0);
+	}
+
+	trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+	return true;
+
+out_force_log:
+	spin_unlock(&pag->pagb_lock);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+	spin_lock(&pag->pagb_lock);
+	return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
+void
+xfs_alloc_busy_reuse(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
+{
+	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
+
+	ASSERT(flen > 0);
+
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+restart:
+	rbp = pag->pagb_tree.rb_node;
+	while (rbp) {
+		struct xfs_busy_extent *busyp =
+			rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
+
+		if (fbno + flen <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
+
+		if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+						  userdata))
+			goto restart;
+	}
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
+/*
  * For a given extent [fbno, flen], search the busy extent list to find a
  * subset of the extent that is not busy. If *rlen is smaller than
  * args->minlen no suitable extent could be found, and the higher level
@@ -2734,13 +2784,16 @@ xfs_alloc_busy_trim(
 	xfs_agblock_t		*rbno,
 	xfs_extlen_t		*rlen)
 {
-	xfs_agblock_t		fbno = bno;
-	xfs_extlen_t		flen = len;
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
 	struct rb_node		*rbp;
 
-	ASSERT(flen > 0);
+	ASSERT(len > 0);
 
 	spin_lock(&args->pag->pagb_lock);
+restart:
+	fbno = bno;
+	flen = len;
 	rbp = args->pag->pagb_tree.rb_node;
 	while (rbp && flen >= args->minlen) {
 		struct xfs_busy_extent *busyp =
@@ -2757,6 +2810,18 @@ xfs_alloc_busy_trim(
 			continue;
 		}
 
+		/*
+		 * If this is a metadata allocation, try to reuse the busy
+		 * extent instead of trimming the allocation.
+		 */
+		if (!args->userdata) {
+			if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+							  busyp, fbno, flen,
+							  false))
+				goto restart;
+			continue;
+		}
+
 		if (bbno <= fbno) {
 			/* start overlap */
 
@@ -2906,17 +2971,15 @@ xfs_alloc_busy_clear(
 {
 	struct xfs_perag	*pag;
 
-	trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
-						busyp->length);
-
-	ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
-						busyp->length) == 1);
-
 	list_del_init(&busyp->list);
 
 	pag = xfs_perag_get(mp, busyp->agno);
 	spin_lock(&pag->pagb_lock);
-	rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	if (busyp->length) {
+		trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+						busyp->length);
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	}
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
 
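To see the case analysis above in action, the illustrative
busy_update_extent() from the sketch near the top of this page can be driven
with a few arbitrary intervals (userspace only, reusing that sketch's struct
and function; the block numbers are made up):

#include <assert.h>

int main(void)
{
	/* Cases 2-5: the free extent covers the busy extent entirely. */
	struct busy_extent full = { .bno = 10, .length = 5 };
	assert(busy_update_extent(&full, 10, 5, false) && full.length == 0);

	/* Cases 6-7: front overlap moves the busy start up to fend. */
	struct busy_extent front = { .bno = 10, .length = 10 };
	assert(busy_update_extent(&front, 8, 6, false) && front.bno == 14);

	/* Cases 8-9: back overlap trims the busy length down to fbno. */
	struct busy_extent back = { .bno = 10, .length = 10 };
	assert(busy_update_extent(&back, 15, 10, false) && back.length == 5);

	/* Case 1: a hole in the middle cannot be split; force and retry. */
	struct busy_extent mid = { .bno = 10, .length = 10 };
	assert(!busy_update_extent(&mid, 12, 3, false));

	/* User-data allocations always take the force-log path. */
	struct busy_extent user = { .bno = 10, .length = 5 };
	assert(!busy_update_extent(&user, 10, 5, true));

	return 0;
}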