Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r--	fs/xfs/xfs_alloc.c	1029
1 file changed, 699 insertions, 330 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9bf..95862bbff56b 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
 #define	XFSA_FIXUP_BNO_OK	1
 #define	XFSA_FIXUP_CNT_OK	2
 
-/*
- * Prototypes for per-ag allocation routines
- */
-
 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
 	xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
+STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
+	xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
 
-/*
- * Internal functions.
- */
-
 /*
  * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -147,27 +141,28 @@ xfs_alloc_get_rec(
  */
 STATIC void
 xfs_alloc_compute_aligned(
+	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
-	xfs_extlen_t	alignment,	/* alignment for allocation */
-	xfs_extlen_t	minlen,		/* minimum length for allocation */
 	xfs_agblock_t	*resbno,	/* result block number */
 	xfs_extlen_t	*reslen)	/* result length */
 {
 	xfs_agblock_t	bno;
-	xfs_extlen_t	diff;
 	xfs_extlen_t	len;
 
-	if (alignment > 1 && foundlen >= minlen) {
-		bno = roundup(foundbno, alignment);
-		diff = bno - foundbno;
-		len = diff >= foundlen ? 0 : foundlen - diff;
+	/* Trim busy sections out of found extent */
+	xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
+
+	if (args->alignment > 1 && len >= args->minlen) {
+		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);
+		xfs_extlen_t	diff = aligned_bno - bno;
+
+		*resbno = aligned_bno;
+		*reslen = diff >= len ? 0 : len - diff;
 	} else {
-		bno = foundbno;
-		len = foundlen;
+		*resbno = bno;
+		*reslen = len;
 	}
-	*resbno = bno;
-	*reslen = len;
 }
 
 /*
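
The reworked xfs_alloc_compute_aligned() above now trims busy blocks off the found extent before applying alignment. A minimal userspace sketch of that trim-then-align arithmetic, using plain integer types and a hard-coded busy range in place of the real xfs_alloc_busy_trim() rbtree lookup (illustrative only, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define ROUNDUP(x, a)	((((x) + (a) - 1) / (a)) * (a))

/*
 * Stand-in for xfs_alloc_busy_trim(): trim a hard-coded busy range
 * [20, 30) off the front of the found extent.  Illustrative only.
 */
static void busy_trim(uint32_t fbno, uint32_t flen,
		      uint32_t *bno, uint32_t *len)
{
	uint32_t busy_end = 30;

	if (fbno < busy_end && fbno + flen > busy_end) {
		*bno = busy_end;		/* skip the busy head */
		*len = fbno + flen - busy_end;
	} else {
		*bno = fbno;
		*len = flen;
	}
}

/* Mirrors the logic of the patched xfs_alloc_compute_aligned(). */
static void compute_aligned(uint32_t foundbno, uint32_t foundlen,
			    uint32_t alignment, uint32_t minlen,
			    uint32_t *resbno, uint32_t *reslen)
{
	uint32_t bno, len;

	busy_trim(foundbno, foundlen, &bno, &len);

	if (alignment > 1 && len >= minlen) {
		uint32_t aligned_bno = ROUNDUP(bno, alignment);
		uint32_t diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}
}

int main(void)
{
	uint32_t rbno, rlen;

	/*
	 * Found extent [20, 60), busy head [20, 30), alignment 8:
	 * trim to [30, 60), round the start up to 32, result [32, 60).
	 */
	compute_aligned(20, 40, 8, 4, &rbno, &rlen);
	printf("resbno=%u reslen=%u\n", rbno, rlen);	/* 32, 28 */
	return 0;
}
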
@@ -281,7 +276,6 @@ xfs_alloc_fix_minleft(
 		return 1;
 	agf = XFS_BUF_TO_AGF(args->agbp);
 	diff = be32_to_cpu(agf->agf_freeblks)
-		+ be32_to_cpu(agf->agf_flcount)
 		- args->len - args->minleft;
 	if (diff >= 0)
 		return 1;
@@ -464,6 +458,27 @@ xfs_alloc_read_agfl(
 	return 0;
 }
 
+STATIC int
+xfs_alloc_update_counters(
+	struct xfs_trans	*tp,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agbp,
+	long			len)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+
+	pag->pagf_freeblks += len;
+	be32_add_cpu(&agf->agf_freeblks, len);
+
+	xfs_trans_agblocks_delta(tp, len);
+	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
+		     be32_to_cpu(agf->agf_length)))
+		return EFSCORRUPTED;
+
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
+	return 0;
+}
+
 /*
  * Allocation group level functions.
  */
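
The new xfs_alloc_update_counters() helper consolidates the per-AG and on-disk free-block accounting previously duplicated between the allocation and free paths: callers pass a signed delta, negative when allocating and positive when freeing, and get EFSCORRUPTED back if the AGF counter would exceed the AG size. A toy model of that contract, with a hypothetical struct standing in for xfs_perag/xfs_agf (a sketch, not the kernel API):

#include <stdio.h>

/* Hypothetical stand-ins for the xfs_perag/xfs_agf counter fields. */
struct counters {
	long	pagf_freeblks;	/* in-core per-AG free blocks */
	long	agf_freeblks;	/* on-disk AGF free blocks */
	long	agf_length;	/* AG size, bounds the free count */
};

/* Mirrors the update-then-validate pattern of xfs_alloc_update_counters(). */
static int update_counters(struct counters *c, long len)
{
	c->pagf_freeblks += len;
	c->agf_freeblks += len;
	if (c->agf_freeblks > c->agf_length)
		return -1;	/* EFSCORRUPTED in the kernel */
	return 0;
}

int main(void)
{
	struct counters c = { 100, 100, 4096 };

	update_counters(&c, -8);	/* allocation: negative delta */
	update_counters(&c, 8);		/* free: positive delta */
	printf("freeblks=%ld\n", c.agf_freeblks);	/* 100 */
	return 0;
}
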
@@ -505,49 +520,36 @@ xfs_alloc_ag_vextent(
 		ASSERT(0);
 		/* NOTREACHED */
 	}
-	if (error)
+
+	if (error || args->agbno == NULLAGBLOCK)
 		return error;
-	/*
-	 * If the allocation worked, need to change the agf structure
-	 * (and log it), and the superblock.
-	 */
-	if (args->agbno != NULLAGBLOCK) {
-		xfs_agf_t	*agf;	/* allocation group freelist header */
-		long		slen = (long)args->len;
 
-		ASSERT(args->len >= args->minlen && args->len <= args->maxlen);
-		ASSERT(!(args->wasfromfl) || !args->isfl);
-		ASSERT(args->agbno % args->alignment == 0);
-		if (!(args->wasfromfl)) {
-
-			agf = XFS_BUF_TO_AGF(args->agbp);
-			be32_add_cpu(&agf->agf_freeblks, -(args->len));
-			xfs_trans_agblocks_delta(args->tp,
-						 -((long)(args->len)));
-			args->pag->pagf_freeblks -= args->len;
-			ASSERT(be32_to_cpu(agf->agf_freeblks) <=
-				be32_to_cpu(agf->agf_length));
-			xfs_alloc_log_agf(args->tp, args->agbp,
-						XFS_AGF_FREEBLKS);
-			/*
-			 * Search the busylist for these blocks and mark the
-			 * transaction as synchronous if blocks are found. This
-			 * avoids the need to block due to a synchronous log
-			 * force to ensure correct ordering as the synchronous
-			 * transaction will guarantee that for us.
-			 */
-			if (xfs_alloc_busy_search(args->mp, args->agno,
-					args->agbno, args->len))
-				xfs_trans_set_sync(args->tp);
-		}
-		if (!args->isfl)
-			xfs_trans_mod_sb(args->tp,
-				args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
-					XFS_TRANS_SB_FDBLOCKS, -slen);
-		XFS_STATS_INC(xs_allocx);
-		XFS_STATS_ADD(xs_allocb, args->len);
-	}
-	return 0;
+	ASSERT(args->len >= args->minlen);
+	ASSERT(args->len <= args->maxlen);
+	ASSERT(!args->wasfromfl || !args->isfl);
+	ASSERT(args->agbno % args->alignment == 0);
+
+	if (!args->wasfromfl) {
+		error = xfs_alloc_update_counters(args->tp, args->pag,
+						  args->agbp,
+						  -((long)(args->len)));
+		if (error)
+			return error;
+
+		ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
+					      args->agbno, args->len));
+	}
+
+	if (!args->isfl) {
+		xfs_trans_mod_sb(args->tp, args->wasdel ?
+				 XFS_TRANS_SB_RES_FDBLOCKS :
+				 XFS_TRANS_SB_FDBLOCKS,
+				 -((long)(args->len)));
+	}
+
+	XFS_STATS_INC(xs_allocx);
+	XFS_STATS_ADD(xs_allocb, args->len);
+	return error;
 }
 
 /*
@@ -562,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
 {
 	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
 	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
-	xfs_agblock_t	end;	/* end of allocated extent */
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
-	xfs_agblock_t	fend;	/* end block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
+	xfs_agblock_t	tbno;	/* start block of trimmed extent */
+	xfs_extlen_t	tlen;	/* length of trimmed extent */
+	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	xfs_agblock_t	end;	/* end of allocated extent */
 	int		i;	/* success/failure of operation */
-	xfs_agblock_t	maxend;	/* end of maximal extent */
-	xfs_agblock_t	minend;	/* end of minimal extent */
 	xfs_extlen_t	rlen;	/* length of returned extent */
 
 	ASSERT(args->alignment == 1);
@@ -599,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	ASSERT(fbno <= args->agbno);
-	minend = args->agbno + args->minlen;
-	maxend = args->agbno + args->maxlen;
-	fend = fbno + flen;
 
 	/*
-	 * Give up if the freespace isn't long enough for the minimum request.
+	 * Check for overlapping busy extents.
+	 */
+	xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
+
+	/*
+	 * Give up if the start of the extent is busy, or the freespace isn't
+	 * long enough for the minimum request.
 	 */
-	if (fend < minend)
+	if (tbno > args->agbno)
+		goto not_found;
+	if (tlen < args->minlen)
+		goto not_found;
+	tend = tbno + tlen;
+	if (tend < args->agbno + args->minlen)
 		goto not_found;
 
 	/*
@@ -615,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
 	 *
 	 * Fix the length according to mod and prod if given.
 	 */
-	end = XFS_AGBLOCK_MIN(fend, maxend);
+	end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
 	args->len = end - args->agbno;
 	xfs_alloc_fix_len(args);
 	if (!xfs_alloc_fix_minleft(args))
 		goto not_found;
 
 	rlen = args->len;
-	ASSERT(args->agbno + rlen <= fend);
+	ASSERT(args->agbno + rlen <= tend);
 	end = args->agbno + rlen;
 
 	/*
@@ -671,11 +681,11 @@ xfs_alloc_find_best_extent(
 	struct xfs_btree_cur	**scur,	/* searching cursor */
 	xfs_agblock_t		gdiff,	/* difference for search comparison */
 	xfs_agblock_t		*sbno,	/* extent found by search */
-	xfs_extlen_t		*slen,
-	xfs_extlen_t		*slena,	/* aligned length */
+	xfs_extlen_t		*slen,	/* extent length */
+	xfs_agblock_t		*sbnoa,	/* aligned extent found by search */
+	xfs_extlen_t		*slena,	/* aligned extent length */
 	int			dir)	/* 0 = search right, 1 = search left */
 {
-	xfs_agblock_t		bno;
 	xfs_agblock_t		new;
 	xfs_agblock_t		sdiff;
 	int			error;
@@ -693,17 +703,16 @@ xfs_alloc_find_best_extent(
 	if (error)
 		goto error0;
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	xfs_alloc_compute_aligned(*sbno, *slen, args->alignment,
-				  args->minlen, &bno, slena);
+	xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
 
 	/*
 	 * The good extent is closer than this one.
 	 */
 	if (!dir) {
-		if (bno >= args->agbno + gdiff)
+		if (*sbnoa >= args->agbno + gdiff)
 			goto out_use_good;
 	} else {
-		if (bno <= args->agbno - gdiff)
+		if (*sbnoa <= args->agbno - gdiff)
 			goto out_use_good;
 	}
 
@@ -715,8 +724,8 @@ xfs_alloc_find_best_extent(
 	xfs_alloc_fix_len(args);
 
 	sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				       args->alignment, *sbno,
-				       *slen, &new);
+				       args->alignment, *sbnoa,
+				       *slena, &new);
 
 	/*
 	 * Choose closer size and invalidate other cursor.
@@ -766,7 +775,7 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	gtbnoa;		/* aligned ... */
 	xfs_extlen_t	gtdiff;		/* difference to right side entry */
 	xfs_extlen_t	gtlen;		/* length of right side entry */
-	xfs_extlen_t	gtlena = 0;	/* aligned ... */
+	xfs_extlen_t	gtlena;		/* aligned ... */
 	xfs_agblock_t	gtnew;		/* useful start bno of right side */
 	int		error;		/* error code */
 	int		i;		/* result code, temporary */
@@ -775,9 +784,10 @@ xfs_alloc_ag_vextent_near(
 	xfs_agblock_t	ltbnoa;		/* aligned ... */
 	xfs_extlen_t	ltdiff;		/* difference to left side entry */
 	xfs_extlen_t	ltlen;		/* length of left side entry */
-	xfs_extlen_t	ltlena = 0;	/* aligned ... */
+	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
 #if defined(DEBUG) && defined(__KERNEL__)
 	/*
 	 * Randomly don't execute the first algorithm.
@@ -786,13 +796,20 @@ xfs_alloc_ag_vextent_near(
 
 	dofirst = random32() & 1;
 #endif
+
+restart:
+	bno_cur_lt = NULL;
+	bno_cur_gt = NULL;
+	ltlen = 0;
+	gtlena = 0;
+	ltlena = 0;
+
 	/*
 	 * Get a cursor for the by-size btree.
 	 */
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
-	ltlen = 0;
-	bno_cur_lt = bno_cur_gt = NULL;
+
 	/*
 	 * See if there are any free extents as big as maxlen.
 	 */
@@ -808,11 +825,13 @@ xfs_alloc_ag_vextent_near(
 			goto error0;
 		if (i == 0 || ltlen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_near_noentry(args);
 			return 0;
 		}
 		ASSERT(i == 1);
 	}
 	args->wasfromfl = 0;
+
 	/*
 	 * First algorithm.
 	 * If the requested extent is large wrt the freespaces available
@@ -866,8 +885,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-					args->minlen, &ltbnoa, &ltlena);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
 			if (ltlena < args->minlen)
 				continue;
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -876,7 +895,7 @@ xfs_alloc_ag_vextent_near(
 			if (args->len < blen)
 				continue;
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbno, ltlen, &ltnew);
+				args->alignment, ltbnoa, ltlena, &ltnew);
 			if (ltnew != NULLAGBLOCK &&
 			    (args->len > blen || ltdiff < bdiff)) {
 				bdiff = ltdiff;
@@ -987,8 +1006,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
-					args->minlen, &ltbnoa, &ltlena);
+			xfs_alloc_compute_aligned(args, ltbno, ltlen,
+						  &ltbnoa, &ltlena);
 			if (ltlena >= args->minlen)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1022,8 @@ xfs_alloc_ag_vextent_near(
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-			xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
-					args->minlen, &gtbnoa, &gtlena);
+			xfs_alloc_compute_aligned(args, gtbno, gtlen,
+						  &gtbnoa, &gtlena);
 			if (gtlena >= args->minlen)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1028,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
 			args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, ltbno, ltlen, &ltnew);
+				args->alignment, ltbnoa, ltlena, &ltnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_lt, &bno_cur_gt,
-						ltdiff, &gtbno, &gtlen, &gtlena,
+						ltdiff, &gtbno, &gtlen,
+						&gtbnoa, &gtlena,
 						0 /* search right */);
 		} else {
 			ASSERT(gtlena >= args->minlen);
@@ -1043,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
 			args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
 			xfs_alloc_fix_len(args);
 			gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
-				args->alignment, gtbno, gtlen, &gtnew);
+				args->alignment, gtbnoa, gtlena, &gtnew);
 
 			error = xfs_alloc_find_best_extent(args,
 						&bno_cur_gt, &bno_cur_lt,
-						gtdiff, &ltbno, &ltlen, &ltlena,
+						gtdiff, &ltbno, &ltlen,
+						&ltbnoa, &ltlena,
 						1 /* search left */);
 		}
 
@@ -1059,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
 	 * If we couldn't get anything, give up.
 	 */
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
+		if (!forced++) {
+			trace_xfs_alloc_near_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+
 		trace_xfs_alloc_size_neither(args);
 		args->agbno = NULLAGBLOCK;
 		return 0;
@@ -1093,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
 		return 0;
 	}
 	rlen = args->len;
-	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno,
-		ltlen, &ltnew);
+	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
+				     ltbnoa, ltlena, &ltnew);
 	ASSERT(ltnew >= ltbno);
-	ASSERT(ltnew + rlen <= ltbno + ltlen);
+	ASSERT(ltnew + rlen <= ltbnoa + ltlena);
 	ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 	args->agbno = ltnew;
+
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
 			ltnew, rlen, XFSA_FIXUP_BNO_OK)))
 		goto error0;
@@ -1141,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
+	int		forced = 0;
 
+restart:
 	/*
 	 * Allocate and initialize a cursor for the by-size btree.
 	 */
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
 	 */
 	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
 			args->maxlen + args->alignment - 1, &i)))
 		goto error0;
+
 	/*
-	 * If none, then pick up the last entry in the tree unless the
-	 * tree is empty.
+	 * If none or we have busy extents that we cannot allocate from, then
+	 * we have to settle for a smaller extent. In the case that there are
+	 * no large extents, this will return the last entry in the tree unless
+	 * the tree is empty. In the case that there are only busy large
+	 * extents, this will return the largest small extent unless there
+	 * are no smaller extents available.
 	 */
-	if (!i) {
-		if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno,
-				&flen, &i)))
+	if (!i || forced > 1) {
+		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
+						   &fbno, &flen, &i);
+		if (error)
 			goto error0;
 		if (i == 0 || flen == 0) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1168,23 +1205,56 @@ xfs_alloc_ag_vextent_size(
 			return 0;
 		}
 		ASSERT(i == 1);
+		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+	} else {
+		/*
+		 * Search for a non-busy extent that is large enough.
+		 * If we are at low space, don't check, or if we fall off
+		 * the end of the btree, turn off the busy check and
+		 * restart.
+		 */
+		for (;;) {
+			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
+			if (error)
+				goto error0;
+			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
+
+			if (rlen >= args->maxlen)
+				break;
+
+			error = xfs_btree_increment(cnt_cur, 0, &i);
+			if (error)
+				goto error0;
+			if (i == 0) {
+				/*
+				 * Our only valid extents must have been busy.
+				 * Make it unbusy by forcing the log out and
+				 * retrying.  If we've been here before, forcing
+				 * the log isn't making the extents available,
+				 * which means they have probably been freed in
+				 * this transaction.  In that case, we have to
+				 * give up on them and we'll attempt a minlen
+				 * allocation the next time around.
+				 */
+				xfs_btree_del_cursor(cnt_cur,
+						     XFS_BTREE_NOERROR);
+				trace_xfs_alloc_size_busy(args);
+				if (!forced++)
+					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				goto restart;
+			}
+		}
 	}
-	/*
-	 * There's a freespace as big as maxlen+alignment-1, get it.
-	 */
-	else {
-		if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
-			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-	}
+
 	/*
 	 * In the first case above, we got the last entry in the
 	 * by-size btree.  Now we check to see if the space hits maxlen
 	 * once aligned; if not, we search left for something better.
 	 * This can't happen in the second case above.
 	 */
-	xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen,
-		&rbno, &rlen);
 	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 	XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 			(rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1279,8 @@ xfs_alloc_ag_vextent_size(
 			XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(fbno, flen, args->alignment,
-				args->minlen, &rbno, &rlen);
+			xfs_alloc_compute_aligned(args, fbno, flen,
+						  &rbno, &rlen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1238,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
 	 * Fix up the length.
 	 */
 	args->len = rlen;
-	xfs_alloc_fix_len(args);
-	if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) {
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		trace_xfs_alloc_size_nominleft(args);
-		args->agbno = NULLAGBLOCK;
-		return 0;
+	if (rlen < args->minlen) {
+		if (!forced++) {
+			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+			trace_xfs_alloc_size_busy(args);
+			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			goto restart;
+		}
+		goto out_nominleft;
 	}
+	xfs_alloc_fix_len(args);
+
+	if (!xfs_alloc_fix_minleft(args))
+		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
 	/*
@@ -1274,6 +1350,12 @@ error0:
 	if (bno_cur)
 		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
 	return error;
+
+out_nominleft:
+	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
+	trace_xfs_alloc_size_nominleft(args);
+	args->agbno = NULLAGBLOCK;
+	return 0;
 }
 
 /*
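
The `forced` counter added to xfs_alloc_ag_vextent_size() drives a small retry state machine: the first failure forces the log with XFS_LOG_SYNC to clear busy extents and restarts, a second failure (forced > 1) settles for a smaller extent via xfs_alloc_ag_vextent_small(), and only then does the allocation return NULLAGBLOCK. A compact userspace model of that control flow, with a toy predicate standing in for the btree search (illustrative assumptions only, not the kernel logic verbatim):

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in: pretend extents stay busy until one log force. */
static bool find_extent(int log_forces_done)
{
	return log_forces_done >= 1;
}

/* Mirrors the forced/restart shape of xfs_alloc_ag_vextent_size(). */
static int alloc_size(void)
{
	int forced = 0;

restart:
	if (!find_extent(forced)) {
		if (!forced++) {
			/* xfs_log_force(mp, XFS_LOG_SYNC) in the kernel */
			printf("forcing log, restarting\n");
			goto restart;
		}
		printf("falling back to a minlen allocation\n");
		return -1;
	}
	printf("allocation succeeded\n");
	return 0;
}

int main(void)
{
	return alloc_size() ? 1 : 0;
}
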
@@ -1313,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
 		if (error)
 			goto error0;
 		if (fbno != NULLAGBLOCK) {
+			xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
+					     args->userdata);
+
 			if (args->userdata) {
 				xfs_buf_t	*bp;
 
@@ -1388,6 +1473,7 @@ xfs_free_ag_extent(
 	xfs_mount_t	*mp;	/* mount point struct for filesystem */
 	xfs_agblock_t	nbno;	/* new starting block of freespace */
 	xfs_extlen_t	nlen;	/* new length of freespace */
+	xfs_perag_t	*pag;	/* per allocation group data */
 
 	mp = tp->t_mountp;
 	/*
@@ -1586,45 +1672,23 @@ xfs_free_ag_extent(
 	XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 	cnt_cur = NULL;
+
 	/*
 	 * Update the freespace totals in the ag and superblock.
 	 */
-	{
-		xfs_agf_t	*agf;
-		xfs_perag_t	*pag;		/* per allocation group data */
-
-		pag = xfs_perag_get(mp, agno);
-		pag->pagf_freeblks += len;
-		xfs_perag_put(pag);
-
-		agf = XFS_BUF_TO_AGF(agbp);
-		be32_add_cpu(&agf->agf_freeblks, len);
-		xfs_trans_agblocks_delta(tp, len);
-		XFS_WANT_CORRUPTED_GOTO(
-			be32_to_cpu(agf->agf_freeblks) <=
-			be32_to_cpu(agf->agf_length),
-			error0);
-		xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
-		if (!isfl)
-			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
-		XFS_STATS_INC(xs_freex);
-		XFS_STATS_ADD(xs_freeb, len);
-	}
+	pag = xfs_perag_get(mp, agno);
+	error = xfs_alloc_update_counters(tp, pag, agbp, len);
+	xfs_perag_put(pag);
+	if (error)
+		goto error0;
+
+	if (!isfl)
+		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
+	XFS_STATS_INC(xs_freex);
+	XFS_STATS_ADD(xs_freeb, len);
 
 	trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
 
-	/*
-	 * Since blocks move to the free list without the coordination
-	 * used in xfs_bmap_finish, we can't allow blocks to be available
-	 * for reallocation and non-transaction writing (user data)
-	 * until we know that the transaction that moved it to the free
-	 * list is permanently on disk. We track the blocks by declaring
-	 * these blocks as "busy"; the busy list is maintained on a per-ag
-	 * basis and each transaction records which entries should be removed
-	 * when the iclog commits to disk. If a busy block is allocated,
-	 * the iclog is pushed up to the LSN that freed the block.
-	 */
-	xfs_alloc_busy_insert(tp, agno, bno, len);
 	return 0;
 
 error0:
@@ -1919,21 +1983,6 @@ xfs_alloc_get_freelist(
 	xfs_alloc_log_agf(tp, agbp, logflags);
 	*bnop = bno;
 
-	/*
-	 * As blocks are freed, they are added to the per-ag busy list and
-	 * remain there until the freeing transaction is committed to disk.
-	 * Now that we have allocated blocks, this list must be searched to see
-	 * if a block is being reused.  If one is, then the freeing transaction
-	 * must be pushed to disk before this transaction.
-	 *
-	 * We do this by setting the current transaction to a sync transaction
-	 * which guarantees that the freeing transaction is on disk before this
-	 * transaction.  This is done instead of a synchronous log force here so
-	 * that we don't sit and wait with the AGF locked in the transaction
-	 * during the log force.
-	 */
-	if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
-		xfs_trans_set_sync(tp);
 	return 0;
 }
 
@@ -2391,131 +2440,54 @@ xfs_free_extent(
 	memset(&args, 0, sizeof(xfs_alloc_arg_t));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
+
+	/*
+	 * Validate that the block number is legal - this enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
 	args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
-	ASSERT(args.agno < args.mp->m_sb.sb_agcount);
+	if (args.agno >= args.mp->m_sb.sb_agcount)
+		return EFSCORRUPTED;
+
 	args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
+	if (args.agbno >= args.mp->m_sb.sb_agblocks)
+		return EFSCORRUPTED;
+
 	args.pag = xfs_perag_get(args.mp, args.agno);
-	if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
+	ASSERT(args.pag);
+
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
 		goto error0;
-#ifdef DEBUG
-	ASSERT(args.agbp != NULL);
-	ASSERT((args.agbno + len) <=
-		be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length));
-#endif
+
+	/* validate the extent size is legal now we have the agf locked */
+	if (args.agbno + len >
+	    be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
+		error = EFSCORRUPTED;
+		goto error0;
+	}
+
 	error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
+	if (!error)
+		xfs_alloc_busy_insert(tp, args.agno, args.agbno, len, 0);
 error0:
 	xfs_perag_put(args.pag);
 	return error;
 }
 
-
-/*
- * AG Busy list management
- * The busy list contains block ranges that have been freed but whose
- * transactions have not yet hit disk.  If any block listed in a busy
- * list is reused, the transaction that freed it must be forced to disk
- * before continuing to use the block.
- *
- * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_busy_clear - remove an item from the per-ag busy list
- * xfs_alloc_busy_search - search for a busy extent
- */
-
-/*
- * Insert a new extent into the busy tree.
- *
- * The busy extent tree is indexed by the start block of the busy extent.
- * There can be multiple overlapping ranges in the busy extent tree but only
- * ever one entry at a given start block.  The reason for this is that
- * multi-block extents can be freed, then smaller chunks of that extent
- * allocated and freed again before the first transaction commit is on disk.
- * If the exact same start block is freed a second time, we have to wait for
- * that busy extent to pass out of the tree before the new extent is inserted.
- * There are two main cases we have to handle here.
- *
- * The first case is a transaction that triggers a "free - allocate - free"
- * cycle.  This can occur during btree manipulations as a btree block is freed
- * to the freelist, then allocated from the free list, then freed again.  In
- * this case, the second extent free is what triggers the duplicate and as
- * such the transaction IDs should match.  Because the extent was allocated in
- * this transaction, the transaction must be marked as synchronous.  This is
- * true for all cases where the free/alloc/free occurs in the one transaction,
- * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
- * This serves to catch violations of the second case quite effectively.
- *
- * The second case is where the free/alloc/free occur in different
- * transactions.  In this case, the thread freeing the extent the second time
- * can't mark the extent busy immediately because it is already tracked in a
- * transaction that may be committing.  When the log commit for the existing
- * busy extent completes, the busy extent will be removed from the tree.  If we
- * allow the second busy insert to continue using that busy extent structure,
- * it can be freed before this transaction is safely in the log.  Hence our
- * only option in this case is to force the log to remove the existing busy
- * extent from the list before we insert the new one with the current
- * transaction ID.
- *
- * The problem we are trying to avoid in the free-alloc-free in separate
- * transactions is most easily described with a timeline:
- *
- *      Thread 1	Thread 2	Thread 3	xfslogd
- *	xact alloc
- *	free X
- *	mark busy
- *	commit xact
- *	free xact
- *			xact alloc
- *			alloc X
- *			busy search
- *			mark xact sync
- *			commit xact
- *			free xact
- *			force log
- *			checkpoint starts
- *			....
- *					xact alloc
- *					free X
- *					mark busy
- *					finds match
- *					*** KABOOM! ***
- *					....
- *							log IO completes
- *							unbusy X
- *							checkpoint completes
- *
- * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
- * the checkpoint completes, and the busy extent it matched will have been
- * removed from the tree when it is woken.  Hence it can then continue safely.
- *
- * However, to ensure this matching process is robust, we need to use the
- * transaction ID for identifying transactions, as delayed logging results in
- * the busy extent and transaction lifecycles being different.  i.e. the busy
- * extent is active for a lot longer than the transaction.  Hence the
- * transaction structure can be freed and reallocated, then mark the same
- * extent busy again in the new transaction.  In this case the new transaction
- * will have a different tid but can have the same address, and hence we need
- * to check against the tid.
- *
- * Future: for delayed logging, we could avoid the log force if the extent was
- * first freed in the current checkpoint sequence.  This, however, requires the
- * ability to pin the current checkpoint in memory until this transaction
- * commits to ensure that both the original free and the current one combine
- * logically into the one checkpoint.  If the checkpoint sequences are
- * different, however, we still need to wait on a log force.
- */
 void
 xfs_alloc_busy_insert(
 	struct xfs_trans	*tp,
 	xfs_agnumber_t		agno,
 	xfs_agblock_t		bno,
-	xfs_extlen_t		len)
+	xfs_extlen_t		len,
+	unsigned int		flags)
 {
 	struct xfs_busy_extent	*new;
 	struct xfs_busy_extent	*busyp;
 	struct xfs_perag	*pag;
 	struct rb_node		**rbp;
-	struct rb_node		*parent;
-	int			match;
-
+	struct rb_node		*parent = NULL;
 
 	new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
 	if (!new) {
@@ -2524,7 +2496,7 @@ xfs_alloc_busy_insert(
 		 * block, make this a synchronous transaction to ensure that
 		 * the block is not reused before this transaction commits.
 		 */
-		trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+		trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
 		xfs_trans_set_sync(tp);
 		return;
 	}
@@ -2532,66 +2504,29 @@ xfs_alloc_busy_insert(
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;
-	new->tid = xfs_log_get_trans_ident(tp);
-
 	INIT_LIST_HEAD(&new->list);
+	new->flags = flags;
 
 	/* trace before insert to be able to see failed inserts */
-	trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+	trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
 
 	pag = xfs_perag_get(tp->t_mountp, new->agno);
-restart:
 	spin_lock(&pag->pagb_lock);
 	rbp = &pag->pagb_tree.rb_node;
-	parent = NULL;
-	busyp = NULL;
-	match = 0;
-	while (*rbp && match >= 0) {
+	while (*rbp) {
 		parent = *rbp;
 		busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
 
 		if (new->bno < busyp->bno) {
-			/* may overlap, but exact start block is lower */
 			rbp = &(*rbp)->rb_left;
-			if (new->bno + new->length > busyp->bno)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(new->bno + new->length <= busyp->bno);
 		} else if (new->bno > busyp->bno) {
-			/* may overlap, but exact start block is higher */
 			rbp = &(*rbp)->rb_right;
-			if (bno < busyp->bno + busyp->length)
-				match = busyp->tid == new->tid ? 1 : -1;
+			ASSERT(bno >= busyp->bno + busyp->length);
 		} else {
-			match = busyp->tid == new->tid ? 1 : -1;
-			break;
+			ASSERT(0);
 		}
 	}
-	if (match < 0) {
-		/* overlap marked busy in different transaction */
-		spin_unlock(&pag->pagb_lock);
-		xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
-		goto restart;
-	}
-	if (match > 0) {
-		/*
-		 * Overlap marked busy in same transaction.  Update if exact
-		 * start block match, otherwise combine the busy extents into
-		 * a single range.
-		 */
-		if (busyp->bno == new->bno) {
-			busyp->length = max(busyp->length, new->length);
-			spin_unlock(&pag->pagb_lock);
-			ASSERT(tp->t_flags & XFS_TRANS_SYNC);
-			xfs_perag_put(pag);
-			kmem_free(new);
-			return;
-		}
-		rb_erase(&busyp->rb_node, &pag->pagb_tree);
-		new->length = max(busyp->bno + busyp->length,
-					new->bno + new->length) -
-				min(busyp->bno, new->bno);
-		new->bno = min(busyp->bno, new->bno);
-	} else
-		busyp = NULL;
 
 	rb_link_node(&new->rb_node, parent, rbp);
 	rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2599,7 +2534,6 @@ restart:
 	list_add(&new->list, &tp->t_busy);
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
-	kmem_free(busyp);
 }
 
 /*
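
With overlaps now resolved at allocation time, xfs_alloc_busy_insert() reduces to a plain descend-and-link keyed on the start block, and the old match/merge logic becomes ASSERTs. The shape of that loop, sketched over an ordinary binary search tree in place of the kernel's rbtree (rb_link_node()/rb_insert_color() handle the linking and rebalancing in the real code; this is an illustrative userspace reduction):

#include <stdio.h>
#include <assert.h>

/* Simplified node: a plain BST stands in for the kernel rbtree. */
struct busy_extent {
	unsigned int		bno;
	unsigned int		length;
	struct busy_extent	*left, *right;
};

/*
 * Mirrors the descend loop of xfs_alloc_busy_insert(): keyed on bno,
 * asserting that no existing entry overlaps the new extent.
 */
static void busy_insert(struct busy_extent **root, struct busy_extent *new)
{
	struct busy_extent **link = root;

	while (*link) {
		struct busy_extent *busyp = *link;

		if (new->bno < busyp->bno) {
			assert(new->bno + new->length <= busyp->bno);
			link = &busyp->left;
		} else if (new->bno > busyp->bno) {
			assert(new->bno >= busyp->bno + busyp->length);
			link = &busyp->right;
		} else {
			assert(0);	/* duplicate start block */
		}
	}
	*link = new;
}

int main(void)
{
	struct busy_extent *root = NULL;
	struct busy_extent a = { 10, 5, NULL, NULL };
	struct busy_extent b = { 30, 8, NULL, NULL };

	busy_insert(&root, &a);
	busy_insert(&root, &b);
	printf("root bno=%u\n", root->bno);	/* 10 */
	return 0;
}
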
@@ -2648,31 +2582,466 @@ xfs_alloc_busy_search(
 		}
 	}
 	spin_unlock(&pag->pagb_lock);
-	trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
 	xfs_perag_put(pag);
 	return match;
 }
 
+/*
+ * The found free extent [fbno, fend] overlaps part or all of the given busy
+ * extent.  If the overlap covers the beginning, the end, or all of the busy
+ * extent, the overlapping portion can be made unbusy and used for the
+ * allocation.  We can't split a busy extent because we can't modify a
+ * transaction/CIL context busy list, but we can update an entry's block
+ * number or length.
+ *
+ * Returns true if the extent can safely be reused, or false if the search
+ * needs to be restarted.
+ */
+STATIC bool
+xfs_alloc_busy_update_extent(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_busy_extent	*busyp,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
+{
+	xfs_agblock_t		fend = fbno + flen;
+	xfs_agblock_t		bbno = busyp->bno;
+	xfs_agblock_t		bend = bbno + busyp->length;
+
+	/*
+	 * This extent is currently being discarded.  Give the thread
+	 * performing the discard a chance to mark the extent unbusy
+	 * and retry.
+	 */
+	if (busyp->flags & XFS_ALLOC_BUSY_DISCARDED) {
+		spin_unlock(&pag->pagb_lock);
+		delay(1);
+		spin_lock(&pag->pagb_lock);
+		return false;
+	}
+
+	/*
+	 * If there is a busy extent overlapping a user allocation, we have
+	 * no choice but to force the log and retry the search.
+	 *
+	 * Fortunately this does not happen during normal operation, but
+	 * only if the filesystem is very low on space and has to dip into
+	 * the AGFL for normal allocations.
+	 */
+	if (userdata)
+		goto out_force_log;
+
+	if (bbno < fbno && bend > fend) {
+		/*
+		 * Case 1:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +---------+
+		 *        fbno   fend
+		 */
+
+		/*
+		 * We would have to split the busy extent to be able to track
+		 * it correctly, which we cannot do because we would have to
+		 * modify the list of busy extents attached to the transaction
+		 * or CIL context, which is immutable.
+		 *
+		 * Force out the log to clear the busy extent and retry the
+		 * search.
+		 */
+		goto out_force_log;
+	} else if (bbno >= fbno && bend <= fend) {
+		/*
+		 * Case 2:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------+
+		 *    fbno           fend
+		 *
+		 * Case 3:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 4:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +--------------------------+
+		 *    fbno                    fend
+		 *
+		 * Case 5:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +-----------------------------------+
+		 *    fbno                             fend
+		 *
+		 */
+
+		/*
+		 * The busy extent is fully covered by the extent we are
+		 * allocating, and can simply be removed from the rbtree.
+		 * However we cannot remove it from the immutable list
+		 * tracking busy extents in the transaction or CIL context,
+		 * so set the length to zero to mark it invalid.
+		 *
+		 * We also need to restart the busy extent search from the
+		 * tree root, because erasing the node can rearrange the
+		 * tree topology.
+		 */
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		busyp->length = 0;
+		return false;
+	} else if (fend < bend) {
+		/*
+		 * Case 6:
+		 *              bbno           bend
+		 *              +BBBBBBBBBBBBBBBBB+
+		 *    +---------+
+		 *    fbno   fend
+		 *
+		 * Case 7:
+		 *             bbno           bend
+		 *             +BBBBBBBBBBBBBBBBB+
+		 *    +------------------+
+		 *    fbno            fend
+		 *
+		 */
+		busyp->bno = fend;
+	} else if (bbno < fbno) {
+		/*
+		 * Case 8:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +-------------+
+		 *        fbno       fend
+		 *
+		 * Case 9:
+		 *    bbno           bend
+		 *    +BBBBBBBBBBBBBBBBB+
+		 *        +----------------------+
+		 *        fbno                fend
+		 */
+		busyp->length = fbno - busyp->bno;
+	} else {
+		ASSERT(0);
+	}
+
+	trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
+	return true;
+
+out_force_log:
+	spin_unlock(&pag->pagb_lock);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+	trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
+	spin_lock(&pag->pagb_lock);
+	return false;
+}
+
+
+/*
+ * For a given extent [fbno, flen], make sure we can reuse it safely.
+ */
 void
-xfs_alloc_busy_clear(
+xfs_alloc_busy_reuse(
 	struct xfs_mount	*mp,
-	struct xfs_busy_extent	*busyp)
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		fbno,
+	xfs_extlen_t		flen,
+	bool			userdata)
 {
 	struct xfs_perag	*pag;
+	struct rb_node		*rbp;
 
-	trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
-			busyp->length);
+	ASSERT(flen > 0);
 
-	ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
-			busyp->length) == 1);
+	pag = xfs_perag_get(mp, agno);
+	spin_lock(&pag->pagb_lock);
+restart:
+	rbp = pag->pagb_tree.rb_node;
+	while (rbp) {
+		struct xfs_busy_extent *busyp =
+			rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
 
-	list_del_init(&busyp->list);
+		if (fbno + flen <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
 
-	pag = xfs_perag_get(mp, busyp->agno);
-	spin_lock(&pag->pagb_lock);
-	rb_erase(&busyp->rb_node, &pag->pagb_tree);
+		if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
+						  userdata))
+			goto restart;
+	}
 	spin_unlock(&pag->pagb_lock);
 	xfs_perag_put(pag);
+}
+
+/*
+ * For a given extent [fbno, flen], search the busy extent list to find a
+ * subset of the extent that is not busy.  If *rlen is smaller than
+ * args->minlen no suitable extent could be found, and the higher level
+ * code needs to force out the log and retry the allocation.
+ */
+STATIC void
+xfs_alloc_busy_trim(
+	struct xfs_alloc_arg	*args,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	xfs_agblock_t		*rbno,
+	xfs_extlen_t		*rlen)
+{
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
+	struct rb_node		*rbp;
+
+	ASSERT(len > 0);
+
+	spin_lock(&args->pag->pagb_lock);
+restart:
+	fbno = bno;
+	flen = len;
+	rbp = args->pag->pagb_tree.rb_node;
+	while (rbp && flen >= args->minlen) {
+		struct xfs_busy_extent *busyp =
+			rb_entry(rbp, struct xfs_busy_extent, rb_node);
+		xfs_agblock_t	fend = fbno + flen;
+		xfs_agblock_t	bbno = busyp->bno;
+		xfs_agblock_t	bend = bbno + busyp->length;
+
+		if (fend <= bbno) {
+			rbp = rbp->rb_left;
+			continue;
+		} else if (fbno >= bend) {
+			rbp = rbp->rb_right;
+			continue;
+		}
+
+		/*
+		 * If this is a metadata allocation, try to reuse the busy
+		 * extent instead of trimming the allocation.
+		 */
+		if (!args->userdata &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_DISCARDED)) {
+			if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
+							  busyp, fbno, flen,
+							  false))
+				goto restart;
+			continue;
+		}
+
+		if (bbno <= fbno) {
+			/* start overlap */
+
+			/*
+			 * Case 1:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +---------+
+			 *        fbno   fend
+			 *
+			 * Case 2:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +-------------+
+			 *        fbno       fend
+			 *
+			 * Case 3:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-------------+
+			 *    fbno       fend
+			 *
+			 * Case 4:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------+
+			 *    fbno           fend
+			 *
+			 * No unbusy region in extent, return failure.
+			 */
+			if (fend <= bend)
+				goto fail;
+
+			/*
+			 * Case 5:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *        +----------------------+
+			 *        fbno                fend
+			 *
+			 * Case 6:
+			 *    bbno           bend
+			 *    +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *                       +-------+
+			 *                       fbno fend
+			 */
+			fbno = bend;
+		} else if (bend >= fend) {
+			/* end overlap */
+
+			/*
+			 * Case 7:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +------------------+
+			 *    fbno            fend
+			 *
+			 * Case 8:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +--------------------------+
+			 *    fbno                    fend
+			 *
+			 * Needs to be trimmed to:
+			 *    +-------+
+			 *    fbno fend
+			 */
+			fend = bbno;
+		} else {
+			/* middle overlap */
+
+			/*
+			 * Case 9:
+			 *             bbno           bend
+			 *             +BBBBBBBBBBBBBBBBB+
+			 *    +-----------------------------------+
+			 *    fbno                             fend
+			 *
+			 * Can be trimmed to:
+			 *    +-------+        OR         +-------+
+			 *    fbno fend                   fbno fend
+			 *
+			 * Backward allocation leads to significant
+			 * fragmentation of directories, which degrades
+			 * directory performance, therefore we always want to
+			 * choose the option that produces forward allocation
+			 * patterns.
+			 * Preferring the lower bno extent will make the next
+			 * request use "fend" as the start of the next
+			 * allocation; if the segment is no longer busy at
+			 * that point, we'll get a contiguous allocation, but
+			 * even if it is still busy, we will get a forward
+			 * allocation.
+			 * We try to avoid choosing the segment at "bend",
+			 * because that can lead to the next allocation
+			 * taking the segment at "fbno", which would be a
+			 * backward allocation.  We only use the segment at
+			 * "fbno" if it is much larger than the current
+			 * requested size, because in that case there's a
+			 * good chance subsequent allocations will be
+			 * contiguous.
+			 */
+			if (bbno - fbno >= args->maxlen) {
+				/* left candidate fits perfectly */
+				fend = bbno;
+			} else if (fend - bend >= args->maxlen * 4) {
+				/* right candidate has enough free space */
+				fbno = bend;
+			} else if (bbno - fbno >= args->minlen) {
+				/* left candidate fits minimum requirement */
+				fend = bbno;
+			} else {
+				goto fail;
+			}
+		}
+
+		flen = fend - fbno;
+	}
+	spin_unlock(&args->pag->pagb_lock);
+
+	if (fbno != bno || flen != len) {
+		trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
+					  fbno, flen);
+	}
+	*rbno = fbno;
+	*rlen = flen;
+	return;
+fail:
+	/*
+	 * Return a zero extent length as failure indication.  All callers
+	 * re-check if the trimmed extent satisfies the minlen requirement.
+	 */
+	spin_unlock(&args->pag->pagb_lock);
+	trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
+	*rbno = fbno;
+	*rlen = 0;
+}
+
+static void
+xfs_alloc_busy_clear_one(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	struct xfs_busy_extent	*busyp)
+{
+	if (busyp->length) {
+		trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
+					   busyp->length);
+		rb_erase(&busyp->rb_node, &pag->pagb_tree);
+	}
 
+	list_del_init(&busyp->list);
 	kmem_free(busyp);
 }
+
+/*
+ * Remove all extents on the passed in list from the busy extents tree.
+ * If do_discard is set skip extents that need to be discarded, and mark
+ * these as undergoing a discard operation instead.
+ */
+void
+xfs_alloc_busy_clear(
+	struct xfs_mount	*mp,
+	struct list_head	*list,
+	bool			do_discard)
+{
+	struct xfs_busy_extent	*busyp, *n;
+	struct xfs_perag	*pag = NULL;
+	xfs_agnumber_t		agno = NULLAGNUMBER;
+
+	list_for_each_entry_safe(busyp, n, list, list) {
+		if (busyp->agno != agno) {
+			if (pag) {
+				spin_unlock(&pag->pagb_lock);
+				xfs_perag_put(pag);
+			}
+			pag = xfs_perag_get(mp, busyp->agno);
+			spin_lock(&pag->pagb_lock);
+			agno = busyp->agno;
+		}
+
+		if (do_discard && busyp->length &&
+		    !(busyp->flags & XFS_ALLOC_BUSY_SKIP_DISCARD))
+			busyp->flags = XFS_ALLOC_BUSY_DISCARDED;
+		else
+			xfs_alloc_busy_clear_one(mp, pag, busyp);
+	}
+
+	if (pag) {
+		spin_unlock(&pag->pagb_lock);
+		xfs_perag_put(pag);
+	}
+}
+
+/*
+ * Callback for list_sort to sort busy extents by the AG they reside in.
+ */
+int
+xfs_busy_extent_ag_cmp(
+	void			*priv,
+	struct list_head	*a,
+	struct list_head	*b)
+{
+	return container_of(a, struct xfs_busy_extent, list)->agno -
+		container_of(b, struct xfs_busy_extent, list)->agno;
+}