Diffstat (limited to 'fs/xfs/xfs_alloc.c')
 fs/xfs/xfs_alloc.c | 844
 1 file changed, 584 insertions(+), 260 deletions(-)
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 27d64d752eab..acdced86413c 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -41,19 +41,13 @@
41#define XFSA_FIXUP_BNO_OK 1 41#define XFSA_FIXUP_BNO_OK 1
42#define XFSA_FIXUP_CNT_OK 2 42#define XFSA_FIXUP_CNT_OK 2
43 43
44/*
45 * Prototypes for per-ag allocation routines
46 */
47
48STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 44STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
49STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 45STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
50STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 46STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
51STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, 47STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
52 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); 48 xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
53 49STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *,
54/* 50 xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *);
55 * Internal functions.
56 */
57 51
58/* 52/*
59 * Lookup the record equal to [bno, len] in the btree given by cur. 53 * Lookup the record equal to [bno, len] in the btree given by cur.
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned(
154 xfs_extlen_t *reslen) /* result length */ 148 xfs_extlen_t *reslen) /* result length */
155{ 149{
156 xfs_agblock_t bno; 150 xfs_agblock_t bno;
157 xfs_extlen_t diff;
158 xfs_extlen_t len; 151 xfs_extlen_t len;
159 152
160 if (args->alignment > 1 && foundlen >= args->minlen) { 153 /* Trim busy sections out of found extent */
161 bno = roundup(foundbno, args->alignment); 154 xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len);
162 diff = bno - foundbno; 155
163 len = diff >= foundlen ? 0 : foundlen - diff; 156 if (args->alignment > 1 && len >= args->minlen) {
157 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
158 xfs_extlen_t diff = aligned_bno - bno;
159
160 *resbno = aligned_bno;
161 *reslen = diff >= len ? 0 : len - diff;
164 } else { 162 } else {
165 bno = foundbno; 163 *resbno = bno;
166 len = foundlen; 164 *reslen = len;
167 } 165 }
168 *resbno = bno;
169 *reslen = len;
170} 166}
171 167
172/* 168/*
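
The reworked xfs_alloc_compute_aligned() above first trims busy ranges out of the found extent (via the newly added xfs_alloc_busy_trim()) and only then applies the alignment round-up. A minimal userspace sketch of that alignment step, with stand-in types and a stand-in roundup macro rather than the kernel definitions:

#include <stdint.h>

typedef uint32_t agblock_t;	/* stand-in for xfs_agblock_t */
typedef uint32_t extlen_t;	/* stand-in for xfs_extlen_t */

#define ROUNDUP(x, a)	((((x) + (a) - 1) / (a)) * (a))

/*
 * Align the already busy-trimmed extent [bno, bno + len) and report how
 * much usable space is left once the start has been rounded up.
 */
static void compute_aligned(agblock_t bno, extlen_t len, extlen_t alignment,
			    extlen_t minlen, agblock_t *resbno, extlen_t *reslen)
{
	if (alignment > 1 && len >= minlen) {
		agblock_t aligned_bno = ROUNDUP(bno, alignment);
		extlen_t diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}
}

For example, a trimmed extent starting at block 5 with 10 blocks and an alignment of 4 yields resbno = 8 and reslen = 7.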
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft(
280 return 1; 276 return 1;
281 agf = XFS_BUF_TO_AGF(args->agbp); 277 agf = XFS_BUF_TO_AGF(args->agbp);
282 diff = be32_to_cpu(agf->agf_freeblks) 278 diff = be32_to_cpu(agf->agf_freeblks)
283 + be32_to_cpu(agf->agf_flcount)
284 - args->len - args->minleft; 279 - args->len - args->minleft;
285 if (diff >= 0) 280 if (diff >= 0)
286 return 1; 281 return 1;
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent(
541 if (error) 536 if (error)
542 return error; 537 return error;
543 538
544 /* 539 ASSERT(!xfs_alloc_busy_search(args->mp, args->agno,
545 * Search the busylist for these blocks and mark the 540 args->agbno, args->len));
546 * transaction as synchronous if blocks are found. This
547 * avoids the need to block due to a synchronous log
548 * force to ensure correct ordering as the synchronous
549 * transaction will guarantee that for us.
550 */
551 if (xfs_alloc_busy_search(args->mp, args->agno,
552 args->agbno, args->len))
553 xfs_trans_set_sync(args->tp);
554 } 541 }
555 542
556 if (!args->isfl) { 543 if (!args->isfl) {
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact(
577{ 564{
578 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ 565 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
579 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ 566 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
580 xfs_agblock_t end; /* end of allocated extent */
581 int error; 567 int error;
582 xfs_agblock_t fbno; /* start block of found extent */ 568 xfs_agblock_t fbno; /* start block of found extent */
583 xfs_agblock_t fend; /* end block of found extent */
584 xfs_extlen_t flen; /* length of found extent */ 569 xfs_extlen_t flen; /* length of found extent */
570 xfs_agblock_t tbno; /* start block of trimmed extent */
571 xfs_extlen_t tlen; /* length of trimmed extent */
572 xfs_agblock_t tend; /* end block of trimmed extent */
573 xfs_agblock_t end; /* end of allocated extent */
585 int i; /* success/failure of operation */ 574 int i; /* success/failure of operation */
586 xfs_agblock_t maxend; /* end of maximal extent */
587 xfs_agblock_t minend; /* end of minimal extent */
588 xfs_extlen_t rlen; /* length of returned extent */ 575 xfs_extlen_t rlen; /* length of returned extent */
589 576
590 ASSERT(args->alignment == 1); 577 ASSERT(args->alignment == 1);
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact(
614 goto error0; 601 goto error0;
615 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 602 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
616 ASSERT(fbno <= args->agbno); 603 ASSERT(fbno <= args->agbno);
617 minend = args->agbno + args->minlen;
618 maxend = args->agbno + args->maxlen;
619 fend = fbno + flen;
620 604
621 /* 605 /*
622 * Give up if the freespace isn't long enough for the minimum request. 606 * Check for overlapping busy extents.
607 */
608 xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen);
609
610 /*
611 * Give up if the start of the extent is busy, or the freespace isn't
612 * long enough for the minimum request.
623 */ 613 */
624 if (fend < minend) 614 if (tbno > args->agbno)
615 goto not_found;
616 if (tlen < args->minlen)
617 goto not_found;
618 tend = tbno + tlen;
619 if (tend < args->agbno + args->minlen)
625 goto not_found; 620 goto not_found;
626 621
627 /* 622 /*
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact(
630 * 625 *
631 * Fix the length according to mod and prod if given. 626 * Fix the length according to mod and prod if given.
632 */ 627 */
633 end = XFS_AGBLOCK_MIN(fend, maxend); 628 end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen);
634 args->len = end - args->agbno; 629 args->len = end - args->agbno;
635 xfs_alloc_fix_len(args); 630 xfs_alloc_fix_len(args);
636 if (!xfs_alloc_fix_minleft(args)) 631 if (!xfs_alloc_fix_minleft(args))
637 goto not_found; 632 goto not_found;
638 633
639 rlen = args->len; 634 rlen = args->len;
640 ASSERT(args->agbno + rlen <= fend); 635 ASSERT(args->agbno + rlen <= tend);
641 end = args->agbno + rlen; 636 end = args->agbno + rlen;
642 637
643 /* 638 /*
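
Pulling the new checks in xfs_alloc_ag_vextent_exact() together: an exact allocation at agbno can only proceed if busy trimming did not push the start of the free extent past agbno and if enough length remains afterwards. A hedged standalone condensation of those checks, with illustrative names rather than the kernel API:

#include <stdbool.h>
#include <stdint.h>

/*
 * Can an exact allocation of at least 'minlen' blocks starting at 'agbno'
 * still be satisfied by the busy-trimmed extent [tbno, tbno + tlen)?
 */
static bool exact_alloc_possible(uint32_t agbno, uint32_t minlen,
				 uint32_t tbno, uint32_t tlen)
{
	if (tbno > agbno)			/* the start of the request is busy */
		return false;
	if (tlen < minlen)			/* too little left after trimming */
		return false;
	return tbno + tlen >= agbno + minlen;	/* trimmed end covers the minimum */
}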
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent(
686 struct xfs_btree_cur **scur, /* searching cursor */ 681 struct xfs_btree_cur **scur, /* searching cursor */
687 xfs_agblock_t gdiff, /* difference for search comparison */ 682 xfs_agblock_t gdiff, /* difference for search comparison */
688 xfs_agblock_t *sbno, /* extent found by search */ 683 xfs_agblock_t *sbno, /* extent found by search */
689 xfs_extlen_t *slen, 684 xfs_extlen_t *slen, /* extent length */
690 xfs_extlen_t *slena, /* aligned length */ 685 xfs_agblock_t *sbnoa, /* aligned extent found by search */
686 xfs_extlen_t *slena, /* aligned extent length */
691 int dir) /* 0 = search right, 1 = search left */ 687 int dir) /* 0 = search right, 1 = search left */
692{ 688{
693 xfs_agblock_t bno;
694 xfs_agblock_t new; 689 xfs_agblock_t new;
695 xfs_agblock_t sdiff; 690 xfs_agblock_t sdiff;
696 int error; 691 int error;
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent(
708 if (error) 703 if (error)
709 goto error0; 704 goto error0;
710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 705 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); 706 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
712 707
713 /* 708 /*
714 * The good extent is closer than this one. 709 * The good extent is closer than this one.
715 */ 710 */
716 if (!dir) { 711 if (!dir) {
717 if (bno >= args->agbno + gdiff) 712 if (*sbnoa >= args->agbno + gdiff)
718 goto out_use_good; 713 goto out_use_good;
719 } else { 714 } else {
720 if (bno <= args->agbno - gdiff) 715 if (*sbnoa <= args->agbno - gdiff)
721 goto out_use_good; 716 goto out_use_good;
722 } 717 }
723 718
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent(
729 xfs_alloc_fix_len(args); 724 xfs_alloc_fix_len(args);
730 725
731 sdiff = xfs_alloc_compute_diff(args->agbno, args->len, 726 sdiff = xfs_alloc_compute_diff(args->agbno, args->len,
732 args->alignment, *sbno, 727 args->alignment, *sbnoa,
733 *slen, &new); 728 *slena, &new);
734 729
735 /* 730 /*
736 * Choose closer size and invalidate other cursor. 731 * Choose closer size and invalidate other cursor.
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near(
780 xfs_agblock_t gtbnoa; /* aligned ... */ 775 xfs_agblock_t gtbnoa; /* aligned ... */
781 xfs_extlen_t gtdiff; /* difference to right side entry */ 776 xfs_extlen_t gtdiff; /* difference to right side entry */
782 xfs_extlen_t gtlen; /* length of right side entry */ 777 xfs_extlen_t gtlen; /* length of right side entry */
783 xfs_extlen_t gtlena = 0; /* aligned ... */ 778 xfs_extlen_t gtlena; /* aligned ... */
784 xfs_agblock_t gtnew; /* useful start bno of right side */ 779 xfs_agblock_t gtnew; /* useful start bno of right side */
785 int error; /* error code */ 780 int error; /* error code */
786 int i; /* result code, temporary */ 781 int i; /* result code, temporary */
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near(
789 xfs_agblock_t ltbnoa; /* aligned ... */ 784 xfs_agblock_t ltbnoa; /* aligned ... */
790 xfs_extlen_t ltdiff; /* difference to left side entry */ 785 xfs_extlen_t ltdiff; /* difference to left side entry */
791 xfs_extlen_t ltlen; /* length of left side entry */ 786 xfs_extlen_t ltlen; /* length of left side entry */
792 xfs_extlen_t ltlena = 0; /* aligned ... */ 787 xfs_extlen_t ltlena; /* aligned ... */
793 xfs_agblock_t ltnew; /* useful start bno of left side */ 788 xfs_agblock_t ltnew; /* useful start bno of left side */
794 xfs_extlen_t rlen; /* length of returned extent */ 789 xfs_extlen_t rlen; /* length of returned extent */
790 int forced = 0;
795#if defined(DEBUG) && defined(__KERNEL__) 791#if defined(DEBUG) && defined(__KERNEL__)
796 /* 792 /*
797 * Randomly don't execute the first algorithm. 793 * Randomly don't execute the first algorithm.
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near(
800 796
801 dofirst = random32() & 1; 797 dofirst = random32() & 1;
802#endif 798#endif
799
800restart:
801 bno_cur_lt = NULL;
802 bno_cur_gt = NULL;
803 ltlen = 0;
804 gtlena = 0;
805 ltlena = 0;
806
803 /* 807 /*
804 * Get a cursor for the by-size btree. 808 * Get a cursor for the by-size btree.
805 */ 809 */
806 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 810 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
807 args->agno, XFS_BTNUM_CNT); 811 args->agno, XFS_BTNUM_CNT);
808 ltlen = 0; 812
809 bno_cur_lt = bno_cur_gt = NULL;
810 /* 813 /*
811 * See if there are any free extents as big as maxlen. 814 * See if there are any free extents as big as maxlen.
812 */ 815 */
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near(
822 goto error0; 825 goto error0;
823 if (i == 0 || ltlen == 0) { 826 if (i == 0 || ltlen == 0) {
824 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 827 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
828 trace_xfs_alloc_near_noentry(args);
825 return 0; 829 return 0;
826 } 830 }
827 ASSERT(i == 1); 831 ASSERT(i == 1);
828 } 832 }
829 args->wasfromfl = 0; 833 args->wasfromfl = 0;
834
830 /* 835 /*
831 * First algorithm. 836 * First algorithm.
832 * If the requested extent is large wrt the freespaces available 837 * If the requested extent is large wrt the freespaces available
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near(
890 if (args->len < blen) 895 if (args->len < blen)
891 continue; 896 continue;
892 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 897 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
893 args->alignment, ltbno, ltlen, &ltnew); 898 args->alignment, ltbnoa, ltlena, &ltnew);
894 if (ltnew != NULLAGBLOCK && 899 if (ltnew != NULLAGBLOCK &&
895 (args->len > blen || ltdiff < bdiff)) { 900 (args->len > blen || ltdiff < bdiff)) {
896 bdiff = ltdiff; 901 bdiff = ltdiff;
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near(
1042 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 1047 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
1043 xfs_alloc_fix_len(args); 1048 xfs_alloc_fix_len(args);
1044 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1049 ltdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1045 args->alignment, ltbno, ltlen, &ltnew); 1050 args->alignment, ltbnoa, ltlena, &ltnew);
1046 1051
1047 error = xfs_alloc_find_best_extent(args, 1052 error = xfs_alloc_find_best_extent(args,
1048 &bno_cur_lt, &bno_cur_gt, 1053 &bno_cur_lt, &bno_cur_gt,
1049 ltdiff, &gtbno, &gtlen, &gtlena, 1054 ltdiff, &gtbno, &gtlen,
1055 &gtbnoa, &gtlena,
1050 0 /* search right */); 1056 0 /* search right */);
1051 } else { 1057 } else {
1052 ASSERT(gtlena >= args->minlen); 1058 ASSERT(gtlena >= args->minlen);
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near(
1057 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); 1063 args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen);
1058 xfs_alloc_fix_len(args); 1064 xfs_alloc_fix_len(args);
1059 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, 1065 gtdiff = xfs_alloc_compute_diff(args->agbno, args->len,
1060 args->alignment, gtbno, gtlen, &gtnew); 1066 args->alignment, gtbnoa, gtlena, &gtnew);
1061 1067
1062 error = xfs_alloc_find_best_extent(args, 1068 error = xfs_alloc_find_best_extent(args,
1063 &bno_cur_gt, &bno_cur_lt, 1069 &bno_cur_gt, &bno_cur_lt,
1064 gtdiff, &ltbno, &ltlen, &ltlena, 1070 gtdiff, &ltbno, &ltlen,
1071 &ltbnoa, &ltlena,
1065 1 /* search left */); 1072 1 /* search left */);
1066 } 1073 }
1067 1074
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near(
1073 * If we couldn't get anything, give up. 1080 * If we couldn't get anything, give up.
1074 */ 1081 */
1075 if (bno_cur_lt == NULL && bno_cur_gt == NULL) { 1082 if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
1083 if (!forced++) {
1084 trace_xfs_alloc_near_busy(args);
1085 xfs_log_force(args->mp, XFS_LOG_SYNC);
1086 goto restart;
1087 }
1088
1076 trace_xfs_alloc_size_neither(args); 1089 trace_xfs_alloc_size_neither(args);
1077 args->agbno = NULLAGBLOCK; 1090 args->agbno = NULLAGBLOCK;
1078 return 0; 1091 return 0;
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near(
1107 return 0; 1120 return 0;
1108 } 1121 }
1109 rlen = args->len; 1122 rlen = args->len;
1110 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, 1123 (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
1111 ltlen, &ltnew); 1124 ltbnoa, ltlena, &ltnew);
1112 ASSERT(ltnew >= ltbno); 1125 ASSERT(ltnew >= ltbno);
1113 ASSERT(ltnew + rlen <= ltbno + ltlen); 1126 ASSERT(ltnew + rlen <= ltbnoa + ltlena);
1114 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1127 ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1115 args->agbno = ltnew; 1128 args->agbno = ltnew;
1129
1116 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, 1130 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen,
1117 ltnew, rlen, XFSA_FIXUP_BNO_OK))) 1131 ltnew, rlen, XFSA_FIXUP_BNO_OK)))
1118 goto error0; 1132 goto error0;
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size(
1155 int i; /* temp status variable */ 1169 int i; /* temp status variable */
1156 xfs_agblock_t rbno; /* returned block number */ 1170 xfs_agblock_t rbno; /* returned block number */
1157 xfs_extlen_t rlen; /* length of returned extent */ 1171 xfs_extlen_t rlen; /* length of returned extent */
1172 int forced = 0;
1158 1173
1174restart:
1159 /* 1175 /*
1160 * Allocate and initialize a cursor for the by-size btree. 1176 * Allocate and initialize a cursor for the by-size btree.
1161 */ 1177 */
1162 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1178 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1163 args->agno, XFS_BTNUM_CNT); 1179 args->agno, XFS_BTNUM_CNT);
1164 bno_cur = NULL; 1180 bno_cur = NULL;
1181
1165 /* 1182 /*
1166 * Look for an entry >= maxlen+alignment-1 blocks. 1183 * Look for an entry >= maxlen+alignment-1 blocks.
1167 */ 1184 */
1168 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1185 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1169 args->maxlen + args->alignment - 1, &i))) 1186 args->maxlen + args->alignment - 1, &i)))
1170 goto error0; 1187 goto error0;
1188
1171 /* 1189 /*
1172 * If none, then pick up the last entry in the tree unless the 1190 * If none or we have busy extents that we cannot allocate from, then
1173 * tree is empty. 1191 * we have to settle for a smaller extent. In the case that there are
1192 * no large extents, this will return the last entry in the tree unless
1193 * the tree is empty. In the case that there are only busy large
1194 * extents, this will return the largest small extent unless there
1195 * are no smaller extents available.
1174 */ 1196 */
1175 if (!i) { 1197 if (!i || forced > 1) {
1176 if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, 1198 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1177 &flen, &i))) 1199 &fbno, &flen, &i);
1200 if (error)
1178 goto error0; 1201 goto error0;
1179 if (i == 0 || flen == 0) { 1202 if (i == 0 || flen == 0) {
1180 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1203 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size(
1182 return 0; 1205 return 0;
1183 } 1206 }
1184 ASSERT(i == 1); 1207 ASSERT(i == 1);
1208 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1209 } else {
1210 /*
1211 * Search for a non-busy extent that is large enough.
1211 1212 * If we are at low space, don't check, or if we fall off
1213 * the end of the btree, turn off the busy check and
1214 * restart.
1215 */
1216 for (;;) {
1217 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1218 if (error)
1219 goto error0;
1220 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1221
1222 xfs_alloc_compute_aligned(args, fbno, flen,
1223 &rbno, &rlen);
1224
1225 if (rlen >= args->maxlen)
1226 break;
1227
1228 error = xfs_btree_increment(cnt_cur, 0, &i);
1229 if (error)
1230 goto error0;
1231 if (i == 0) {
1232 /*
1233 * Our only valid extents must have been busy.
1234 * Make them unbusy by forcing the log out and
1235 * retrying. If we've been here before, forcing
1236 * the log isn't making the extents available,
1237 * which means they have probably been freed in
1238 * this transaction. In that case, we have to
1239 * give up on them and we'll attempt a minlen
1240 * allocation the next time around.
1241 */
1242 xfs_btree_del_cursor(cnt_cur,
1243 XFS_BTREE_NOERROR);
1244 trace_xfs_alloc_size_busy(args);
1245 if (!forced++)
1246 xfs_log_force(args->mp, XFS_LOG_SYNC);
1247 goto restart;
1248 }
1249 }
1185 } 1250 }
1186 /* 1251
1187 * There's a freespace as big as maxlen+alignment-1, get it.
1188 */
1189 else {
1190 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i)))
1191 goto error0;
1192 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1193 }
1194 /* 1252 /*
1195 * In the first case above, we got the last entry in the 1253 * In the first case above, we got the last entry in the
1196 * by-size btree. Now we check to see if the space hits maxlen 1254 * by-size btree. Now we check to see if the space hits maxlen
1197 * once aligned; if not, we search left for something better. 1255 * once aligned; if not, we search left for something better.
1198 * This can't happen in the second case above. 1256 * This can't happen in the second case above.
1199 */ 1257 */
1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1258 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1259 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1260 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
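
The busy-aware search loop added above is the heart of this change for the by-size allocator: candidates that are large enough before trimming are skipped when too much of them is busy, and only when every large candidate turns out to be busy is the log forced and the search restarted (a second failure falls back to a minlen allocation). A toy, self-contained model of that control flow; all data and helpers here are invented for illustration and are not the kernel interfaces:

#include <stdint.h>
#include <stdio.h>

struct ext { uint32_t bno, len; };

/* Invented free extents in by-size order, and a single busy range. */
static const struct ext by_size[] = { {10, 8}, {40, 16}, {100, 32} };
static struct ext busy = { 100, 32 };		/* the only big extent is busy */

/* Trim [bno, bno + len) against the single busy range (simplified). */
static void busy_trim(uint32_t bno, uint32_t len, uint32_t *rbno, uint32_t *rlen)
{
	uint32_t end = bno + len, bend = busy.bno + busy.len;

	if (busy.len == 0 || end <= busy.bno || bno >= bend) {
		*rbno = bno; *rlen = len;		/* no overlap */
	} else if (busy.bno > bno) {
		*rbno = bno; *rlen = busy.bno - bno;	/* keep the front */
	} else if (bend < end) {
		*rbno = bend; *rlen = end - bend;	/* keep the tail */
	} else {
		*rbno = bno; *rlen = 0;			/* fully busy */
	}
}

int main(void)
{
	uint32_t maxlen = 32, rbno = 0, rlen = 0;
	int forced = 0;
	size_t i;

restart:
	/* Walk candidates that are big enough before trimming. */
	for (i = 0; i < sizeof(by_size) / sizeof(by_size[0]); i++) {
		if (by_size[i].len < maxlen)
			continue;
		busy_trim(by_size[i].bno, by_size[i].len, &rbno, &rlen);
		if (rlen >= maxlen) {
			printf("allocated [%u, %u)\n",
			       (unsigned)rbno, (unsigned)(rbno + rlen));
			return 0;
		}
	}
	if (!forced++) {
		/* All large candidates were busy: "force the log" and retry. */
		busy.len = 0;	/* models the busy extent becoming reusable */
		puts("all candidates busy, forcing the log and restarting");
		goto restart;
	}
	puts("giving up, a minlen allocation would be attempted instead");
	return 0;
}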
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size(
1251 * Fix up the length. 1308 * Fix up the length.
1252 */ 1309 */
1253 args->len = rlen; 1310 args->len = rlen;
1254 xfs_alloc_fix_len(args); 1311 if (rlen < args->minlen) {
1255 if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { 1312 if (!forced++) {
1256 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1313 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1257 trace_xfs_alloc_size_nominleft(args); 1314 trace_xfs_alloc_size_busy(args);
1258 args->agbno = NULLAGBLOCK; 1315 xfs_log_force(args->mp, XFS_LOG_SYNC);
1259 return 0; 1316 goto restart;
1317 }
1318 goto out_nominleft;
1260 } 1319 }
1320 xfs_alloc_fix_len(args);
1321
1322 if (!xfs_alloc_fix_minleft(args))
1323 goto out_nominleft;
1261 rlen = args->len; 1324 rlen = args->len;
1262 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1325 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0);
1263 /* 1326 /*
@@ -1287,6 +1350,12 @@ error0:
1287 if (bno_cur) 1350 if (bno_cur)
1288 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1351 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1289 return error; 1352 return error;
1353
1354out_nominleft:
1355 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1356 trace_xfs_alloc_size_nominleft(args);
1357 args->agbno = NULLAGBLOCK;
1358 return 0;
1290} 1359}
1291 1360
1292/* 1361/*
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small(
1326 if (error) 1395 if (error)
1327 goto error0; 1396 goto error0;
1328 if (fbno != NULLAGBLOCK) { 1397 if (fbno != NULLAGBLOCK) {
1398 xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1,
1399 args->userdata);
1400
1329 if (args->userdata) { 1401 if (args->userdata) {
1330 xfs_buf_t *bp; 1402 xfs_buf_t *bp;
1331 1403
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent(
1617 1689
1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1690 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1619 1691
1620 /*
1621 * Since blocks move to the free list without the coordination
1622 * used in xfs_bmap_finish, we can't allow block to be available
1623 * for reallocation and non-transaction writing (user data)
1624 * until we know that the transaction that moved it to the free
1625 * list is permanently on disk. We track the blocks by declaring
1626 * these blocks as "busy"; the busy list is maintained on a per-ag
1627 * basis and each transaction records which entries should be removed
1628 * when the iclog commits to disk. If a busy block is allocated,
1629 * the iclog is pushed up to the LSN that freed the block.
1630 */
1631 xfs_alloc_busy_insert(tp, agno, bno, len);
1632 return 0; 1692 return 0;
1633 1693
1634 error0: 1694 error0:
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist(
1923 xfs_alloc_log_agf(tp, agbp, logflags); 1983 xfs_alloc_log_agf(tp, agbp, logflags);
1924 *bnop = bno; 1984 *bnop = bno;
1925 1985
1926 /*
1927 * As blocks are freed, they are added to the per-ag busy list and
1928 * remain there until the freeing transaction is committed to disk.
1929 * Now that we have allocated blocks, this list must be searched to see
1930 * if a block is being reused. If one is, then the freeing transaction
1931 * must be pushed to disk before this transaction.
1932 *
1933 * We do this by setting the current transaction to a sync transaction
1934 * which guarantees that the freeing transaction is on disk before this
1935 * transaction. This is done instead of a synchronous log force here so
1936 * that we don't sit and wait with the AGF locked in the transaction
1937 * during the log force.
1938 */
1939 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
1940 xfs_trans_set_sync(tp);
1941 return 0; 1986 return 0;
1942} 1987}
1943 1988
@@ -2423,105 +2468,13 @@ xfs_free_extent(
2423 } 2468 }
2424 2469
2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2470 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2471 if (!error)
2472 xfs_alloc_busy_insert(tp, args.agno, args.agbno, len);
2426error0: 2473error0:
2427 xfs_perag_put(args.pag); 2474 xfs_perag_put(args.pag);
2428 return error; 2475 return error;
2429} 2476}
2430 2477
2431
2432/*
2433 * AG Busy list management
2434 * The busy list contains block ranges that have been freed but whose
2435 * transactions have not yet hit disk. If any block listed in a busy
2436 * list is reused, the transaction that freed it must be forced to disk
2437 * before continuing to use the block.
2438 *
2439 * xfs_alloc_busy_insert - add to the per-ag busy list
2440 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2441 * xfs_alloc_busy_search - search for a busy extent
2442 */
2443
2444/*
2445 * Insert a new extent into the busy tree.
2446 *
2447 * The busy extent tree is indexed by the start block of the busy extent.
2448 * there can be multiple overlapping ranges in the busy extent tree but only
2449 * ever one entry at a given start block. The reason for this is that
2450 * multi-block extents can be freed, then smaller chunks of that extent
2451 * allocated and freed again before the first transaction commit is on disk.
2452 * If the exact same start block is freed a second time, we have to wait for
2453 * that busy extent to pass out of the tree before the new extent is inserted.
2454 * There are two main cases we have to handle here.
2455 *
2456 * The first case is a transaction that triggers a "free - allocate - free"
2457 * cycle. This can occur during btree manipulations as a btree block is freed
2458 * to the freelist, then allocated from the free list, then freed again. In
2459 * this case, the second extent free is what triggers the duplicate and as
2460 * such the transaction IDs should match. Because the extent was allocated in
2461 * this transaction, the transaction must be marked as synchronous. This is
2462 * true for all cases where the free/alloc/free occurs in the one transaction,
2463 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2464 * This serves to catch violations of the second case quite effectively.
2465 *
2466 * The second case is where the free/alloc/free occur in different
2467 * transactions. In this case, the thread freeing the extent the second time
2468 * can't mark the extent busy immediately because it is already tracked in a
2469 * transaction that may be committing. When the log commit for the existing
2470 * busy extent completes, the busy extent will be removed from the tree. If we
2471 * allow the second busy insert to continue using that busy extent structure,
2472 * it can be freed before this transaction is safely in the log. Hence our
2473 * only option in this case is to force the log to remove the existing busy
2474 * extent from the list before we insert the new one with the current
2475 * transaction ID.
2476 *
2477 * The problem we are trying to avoid in the free-alloc-free in separate
2478 * transactions is most easily described with a timeline:
2479 *
2480 * Thread 1 Thread 2 Thread 3 xfslogd
2481 * xact alloc
2482 * free X
2483 * mark busy
2484 * commit xact
2485 * free xact
2486 * xact alloc
2487 * alloc X
2488 * busy search
2489 * mark xact sync
2490 * commit xact
2491 * free xact
2492 * force log
2493 * checkpoint starts
2494 * ....
2495 * xact alloc
2496 * free X
2497 * mark busy
2498 * finds match
2499 * *** KABOOM! ***
2500 * ....
2501 * log IO completes
2502 * unbusy X
2503 * checkpoint completes
2504 *
2505 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2506 * the checkpoint completes, and the busy extent it matched will have been
2507 * removed from the tree when it is woken. Hence it can then continue safely.
2508 *
2509 * However, to ensure this matching process is robust, we need to use the
2510 * transaction ID for identifying transactions, as delayed logging results in
2511 * the busy extent and transaction lifecycles being different. i.e. the busy
2512 * extent is active for a lot longer than the transaction. Hence the
2513 * transaction structure can be freed and reallocated, then mark the same
2514 * extent busy again in the new transaction. In this case the new transaction
2515 * will have a different tid but can have the same address, and hence we need
2516 * to check against the tid.
2517 *
2518 * Future: for delayed logging, we could avoid the log force if the extent was
2519 * first freed in the current checkpoint sequence. This, however, requires the
2520 * ability to pin the current checkpoint in memory until this transaction
2521 * commits to ensure that both the original free and the current one combine
2522 * logically into the one checkpoint. If the checkpoint sequences are
2523 * different, however, we still need to wait on a log force.
2524 */
2525void 2478void
2526xfs_alloc_busy_insert( 2479xfs_alloc_busy_insert(
2527 struct xfs_trans *tp, 2480 struct xfs_trans *tp,
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert(
2533 struct xfs_busy_extent *busyp; 2486 struct xfs_busy_extent *busyp;
2534 struct xfs_perag *pag; 2487 struct xfs_perag *pag;
2535 struct rb_node **rbp; 2488 struct rb_node **rbp;
2536 struct rb_node *parent; 2489 struct rb_node *parent = NULL;
2537 int match;
2538
2539 2490
2540 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); 2491 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2541 if (!new) { 2492 if (!new) {
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert(
2544 * block, make this a synchronous transaction to insure that 2495 * block, make this a synchronous transaction to insure that
2545 * the block is not reused before this transaction commits. 2496 * the block is not reused before this transaction commits.
2546 */ 2497 */
2547 trace_xfs_alloc_busy(tp, agno, bno, len, 1); 2498 trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len);
2548 xfs_trans_set_sync(tp); 2499 xfs_trans_set_sync(tp);
2549 return; 2500 return;
2550 } 2501 }
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert(
2552 new->agno = agno; 2503 new->agno = agno;
2553 new->bno = bno; 2504 new->bno = bno;
2554 new->length = len; 2505 new->length = len;
2555 new->tid = xfs_log_get_trans_ident(tp);
2556
2557 INIT_LIST_HEAD(&new->list); 2506 INIT_LIST_HEAD(&new->list);
2558 2507
2559 /* trace before insert to be able to see failed inserts */ 2508 /* trace before insert to be able to see failed inserts */
2560 trace_xfs_alloc_busy(tp, agno, bno, len, 0); 2509 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len);
2561 2510
2562 pag = xfs_perag_get(tp->t_mountp, new->agno); 2511 pag = xfs_perag_get(tp->t_mountp, new->agno);
2563restart:
2564 spin_lock(&pag->pagb_lock); 2512 spin_lock(&pag->pagb_lock);
2565 rbp = &pag->pagb_tree.rb_node; 2513 rbp = &pag->pagb_tree.rb_node;
2566 parent = NULL; 2514 while (*rbp) {
2567 busyp = NULL;
2568 match = 0;
2569 while (*rbp && match >= 0) {
2570 parent = *rbp; 2515 parent = *rbp;
2571 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); 2516 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2572 2517
2573 if (new->bno < busyp->bno) { 2518 if (new->bno < busyp->bno) {
2574 /* may overlap, but exact start block is lower */
2575 rbp = &(*rbp)->rb_left; 2519 rbp = &(*rbp)->rb_left;
2576 if (new->bno + new->length > busyp->bno) 2520 ASSERT(new->bno + new->length <= busyp->bno);
2577 match = busyp->tid == new->tid ? 1 : -1;
2578 } else if (new->bno > busyp->bno) { 2521 } else if (new->bno > busyp->bno) {
2579 /* may overlap, but exact start block is higher */
2580 rbp = &(*rbp)->rb_right; 2522 rbp = &(*rbp)->rb_right;
2581 if (bno < busyp->bno + busyp->length) 2523 ASSERT(bno >= busyp->bno + busyp->length);
2582 match = busyp->tid == new->tid ? 1 : -1;
2583 } else { 2524 } else {
2584 match = busyp->tid == new->tid ? 1 : -1; 2525 ASSERT(0);
2585 break;
2586 } 2526 }
2587 } 2527 }
2588 if (match < 0) {
2589 /* overlap marked busy in different transaction */
2590 spin_unlock(&pag->pagb_lock);
2591 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2592 goto restart;
2593 }
2594 if (match > 0) {
2595 /*
2596 * overlap marked busy in same transaction. Update if exact
2597 * start block match, otherwise combine the busy extents into
2598 * a single range.
2599 */
2600 if (busyp->bno == new->bno) {
2601 busyp->length = max(busyp->length, new->length);
2602 spin_unlock(&pag->pagb_lock);
2603 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2604 xfs_perag_put(pag);
2605 kmem_free(new);
2606 return;
2607 }
2608 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2609 new->length = max(busyp->bno + busyp->length,
2610 new->bno + new->length) -
2611 min(busyp->bno, new->bno);
2612 new->bno = min(busyp->bno, new->bno);
2613 } else
2614 busyp = NULL;
2615 2528
2616 rb_link_node(&new->rb_node, parent, rbp); 2529 rb_link_node(&new->rb_node, parent, rbp);
2617 rb_insert_color(&new->rb_node, &pag->pagb_tree); 2530 rb_insert_color(&new->rb_node, &pag->pagb_tree);
@@ -2619,7 +2532,6 @@ restart:
2619 list_add(&new->list, &tp->t_busy); 2532 list_add(&new->list, &tp->t_busy);
2620 spin_unlock(&pag->pagb_lock); 2533 spin_unlock(&pag->pagb_lock);
2621 xfs_perag_put(pag); 2534 xfs_perag_put(pag);
2622 kmem_free(busyp);
2623} 2535}
2624 2536
2625/* 2537/*
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search(
2668 } 2580 }
2669 } 2581 }
2670 spin_unlock(&pag->pagb_lock); 2582 spin_unlock(&pag->pagb_lock);
2671 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2672 xfs_perag_put(pag); 2583 xfs_perag_put(pag);
2673 return match; 2584 return match;
2674} 2585}
2675 2586
2587/*
2588 * The found free extent [fbno, fend] overlaps part or all of the given busy
2589 * extent. If the overlap covers the beginning, the end, or all of the busy
2590 * extent, the overlapping portion can be made unbusy and used for the
2591 * allocation. We can't split a busy extent because we can't modify a
2592 * transaction/CIL context busy list, but we can update an entry's block
2593 * number or length.
2594 *
2595 * Returns true if the extent can safely be reused, or false if the search
2596 * needs to be restarted.
2597 */
2598STATIC bool
2599xfs_alloc_busy_update_extent(
2600 struct xfs_mount *mp,
2601 struct xfs_perag *pag,
2602 struct xfs_busy_extent *busyp,
2603 xfs_agblock_t fbno,
2604 xfs_extlen_t flen,
2605 bool userdata)
2606{
2607 xfs_agblock_t fend = fbno + flen;
2608 xfs_agblock_t bbno = busyp->bno;
2609 xfs_agblock_t bend = bbno + busyp->length;
2610
2611 /*
2612 * If there is a busy extent overlapping a user allocation, we have
2613 * no choice but to force the log and retry the search.
2614 *
2615 * Fortunately this does not happen during normal operation, but
2616 * only if the filesystem is very low on space and has to dip into
2617 * the AGFL for normal allocations.
2618 */
2619 if (userdata)
2620 goto out_force_log;
2621
2622 if (bbno < fbno && bend > fend) {
2623 /*
2624 * Case 1:
2625 * bbno bend
2626 * +BBBBBBBBBBBBBBBBB+
2627 * +---------+
2628 * fbno fend
2629 */
2630
2631 /*
2632 * We would have to split the busy extent to be able to track
2633 * it correctly, which we cannot do because we would have to
2634 * modify the list of busy extents attached to the transaction
2635 * or CIL context, which is immutable.
2636 *
2637 * Force out the log to clear the busy extent and retry the
2638 * search.
2639 */
2640 goto out_force_log;
2641 } else if (bbno >= fbno && bend <= fend) {
2642 /*
2643 * Case 2:
2644 * bbno bend
2645 * +BBBBBBBBBBBBBBBBB+
2646 * +-----------------+
2647 * fbno fend
2648 *
2649 * Case 3:
2650 * bbno bend
2651 * +BBBBBBBBBBBBBBBBB+
2652 * +--------------------------+
2653 * fbno fend
2654 *
2655 * Case 4:
2656 * bbno bend
2657 * +BBBBBBBBBBBBBBBBB+
2658 * +--------------------------+
2659 * fbno fend
2660 *
2661 * Case 5:
2662 * bbno bend
2663 * +BBBBBBBBBBBBBBBBB+
2664 * +-----------------------------------+
2665 * fbno fend
2666 *
2667 */
2668
2669 /*
2670 * The busy extent is fully covered by the extent we are
2671 * allocating, and can simply be removed from the rbtree.
2672 * However we cannot remove it from the immutable list
2673 * tracking busy extents in the transaction or CIL context,
2674 * so set the length to zero to mark it invalid.
2675 *
2676 * We also need to restart the busy extent search from the
2677 * tree root, because erasing the node can rearrange the
2678 * tree topology.
2679 */
2680 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2681 busyp->length = 0;
2682 return false;
2683 } else if (fend < bend) {
2684 /*
2685 * Case 6:
2686 * bbno bend
2687 * +BBBBBBBBBBBBBBBBB+
2688 * +---------+
2689 * fbno fend
2690 *
2691 * Case 7:
2692 * bbno bend
2693 * +BBBBBBBBBBBBBBBBB+
2694 * +------------------+
2695 * fbno fend
2696 *
2697 */
2698 busyp->bno = fend;
2699 } else if (bbno < fbno) {
2700 /*
2701 * Case 8:
2702 * bbno bend
2703 * +BBBBBBBBBBBBBBBBB+
2704 * +-------------+
2705 * fbno fend
2706 *
2707 * Case 9:
2708 * bbno bend
2709 * +BBBBBBBBBBBBBBBBB+
2710 * +----------------------+
2711 * fbno fend
2712 */
2713 busyp->length = fbno - busyp->bno;
2714 } else {
2715 ASSERT(0);
2716 }
2717
2718 trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen);
2719 return true;
2720
2721out_force_log:
2722 spin_unlock(&pag->pagb_lock);
2723 xfs_log_force(mp, XFS_LOG_SYNC);
2724 trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen);
2725 spin_lock(&pag->pagb_lock);
2726 return false;
2727}
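
The case analysis in xfs_alloc_busy_update_extent() above boils down to three possible fixups of the busy record, plus the one shape that cannot be handled without splitting. A hedged standalone condensation, with names invented for illustration:

#include <stdint.h>

/*
 * Possible outcomes of reusing the free range [fbno, fend) that overlaps
 * the busy range [bbno, bend); a busy extent is never split.
 */
enum busy_fixup {
	BUSY_NEED_LOG_FORCE,	/* busy extent surrounds the range: force the log */
	BUSY_DROP,		/* fully covered: erase it from the rbtree */
	BUSY_CLIP_FRONT,	/* overlap at the front: busy extent now starts at fend */
	BUSY_CLIP_BACK,		/* overlap at the back: busy extent now ends at fbno */
};

static enum busy_fixup classify_overlap(uint32_t fbno, uint32_t fend,
					uint32_t bbno, uint32_t bend)
{
	if (bbno < fbno && bend > fend)
		return BUSY_NEED_LOG_FORCE;	/* case 1 */
	if (bbno >= fbno && bend <= fend)
		return BUSY_DROP;		/* cases 2-5 */
	if (fend < bend)
		return BUSY_CLIP_FRONT;		/* cases 6-7 */
	return BUSY_CLIP_BACK;			/* cases 8-9 */
}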
2728
2729
2730/*
2731 * For a given extent [fbno, flen], make sure we can reuse it safely.
2732 */
2676void 2733void
2677xfs_alloc_busy_clear( 2734xfs_alloc_busy_reuse(
2678 struct xfs_mount *mp, 2735 struct xfs_mount *mp,
2679 struct xfs_busy_extent *busyp) 2736 xfs_agnumber_t agno,
2737 xfs_agblock_t fbno,
2738 xfs_extlen_t flen,
2739 bool userdata)
2680{ 2740{
2681 struct xfs_perag *pag; 2741 struct xfs_perag *pag;
2742 struct rb_node *rbp;
2682 2743
2683 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, 2744 ASSERT(flen > 0);
2684 busyp->length);
2685 2745
2686 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, 2746 pag = xfs_perag_get(mp, agno);
2687 busyp->length) == 1); 2747 spin_lock(&pag->pagb_lock);
2748restart:
2749 rbp = pag->pagb_tree.rb_node;
2750 while (rbp) {
2751 struct xfs_busy_extent *busyp =
2752 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2753 xfs_agblock_t bbno = busyp->bno;
2754 xfs_agblock_t bend = bbno + busyp->length;
2688 2755
2689 list_del_init(&busyp->list); 2756 if (fbno + flen <= bbno) {
2757 rbp = rbp->rb_left;
2758 continue;
2759 } else if (fbno >= bend) {
2760 rbp = rbp->rb_right;
2761 continue;
2762 }
2690 2763
2691 pag = xfs_perag_get(mp, busyp->agno); 2764 if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen,
2692 spin_lock(&pag->pagb_lock); 2765 userdata))
2693 rb_erase(&busyp->rb_node, &pag->pagb_tree); 2766 goto restart;
2767 }
2694 spin_unlock(&pag->pagb_lock); 2768 spin_unlock(&pag->pagb_lock);
2695 xfs_perag_put(pag); 2769 xfs_perag_put(pag);
2770}
2771
2772/*
2773 * For a given extent [fbno, flen], search the busy extent list to find a
2774 * subset of the extent that is not busy. If *rlen is smaller than
2775 * args->minlen no suitable extent could be found, and the higher level
2776 * code needs to force out the log and retry the allocation.
2777 */
2778STATIC void
2779xfs_alloc_busy_trim(
2780 struct xfs_alloc_arg *args,
2781 xfs_agblock_t bno,
2782 xfs_extlen_t len,
2783 xfs_agblock_t *rbno,
2784 xfs_extlen_t *rlen)
2785{
2786 xfs_agblock_t fbno;
2787 xfs_extlen_t flen;
2788 struct rb_node *rbp;
2789
2790 ASSERT(len > 0);
2696 2791
2792 spin_lock(&args->pag->pagb_lock);
2793restart:
2794 fbno = bno;
2795 flen = len;
2796 rbp = args->pag->pagb_tree.rb_node;
2797 while (rbp && flen >= args->minlen) {
2798 struct xfs_busy_extent *busyp =
2799 rb_entry(rbp, struct xfs_busy_extent, rb_node);
2800 xfs_agblock_t fend = fbno + flen;
2801 xfs_agblock_t bbno = busyp->bno;
2802 xfs_agblock_t bend = bbno + busyp->length;
2803
2804 if (fend <= bbno) {
2805 rbp = rbp->rb_left;
2806 continue;
2807 } else if (fbno >= bend) {
2808 rbp = rbp->rb_right;
2809 continue;
2810 }
2811
2812 /*
2813 * If this is a metadata allocation, try to reuse the busy
2814 * extent instead of trimming the allocation.
2815 */
2816 if (!args->userdata) {
2817 if (!xfs_alloc_busy_update_extent(args->mp, args->pag,
2818 busyp, fbno, flen,
2819 false))
2820 goto restart;
2821 continue;
2822 }
2823
2824 if (bbno <= fbno) {
2825 /* start overlap */
2826
2827 /*
2828 * Case 1:
2829 * bbno bend
2830 * +BBBBBBBBBBBBBBBBB+
2831 * +---------+
2832 * fbno fend
2833 *
2834 * Case 2:
2835 * bbno bend
2836 * +BBBBBBBBBBBBBBBBB+
2837 * +-------------+
2838 * fbno fend
2839 *
2840 * Case 3:
2841 * bbno bend
2842 * +BBBBBBBBBBBBBBBBB+
2843 * +-------------+
2844 * fbno fend
2845 *
2846 * Case 4:
2847 * bbno bend
2848 * +BBBBBBBBBBBBBBBBB+
2849 * +-----------------+
2850 * fbno fend
2851 *
2852 * No unbusy region in extent, return failure.
2853 */
2854 if (fend <= bend)
2855 goto fail;
2856
2857 /*
2858 * Case 5:
2859 * bbno bend
2860 * +BBBBBBBBBBBBBBBBB+
2861 * +----------------------+
2862 * fbno fend
2863 *
2864 * Case 6:
2865 * bbno bend
2866 * +BBBBBBBBBBBBBBBBB+
2867 * +--------------------------+
2868 * fbno fend
2869 *
2870 * Needs to be trimmed to:
2871 * +-------+
2872 * fbno fend
2873 */
2874 fbno = bend;
2875 } else if (bend >= fend) {
2876 /* end overlap */
2877
2878 /*
2879 * Case 7:
2880 * bbno bend
2881 * +BBBBBBBBBBBBBBBBB+
2882 * +------------------+
2883 * fbno fend
2884 *
2885 * Case 8:
2886 * bbno bend
2887 * +BBBBBBBBBBBBBBBBB+
2888 * +--------------------------+
2889 * fbno fend
2890 *
2891 * Needs to be trimmed to:
2892 * +-------+
2893 * fbno fend
2894 */
2895 fend = bbno;
2896 } else {
2897 /* middle overlap */
2898
2899 /*
2900 * Case 9:
2901 * bbno bend
2902 * +BBBBBBBBBBBBBBBBB+
2903 * +-----------------------------------+
2904 * fbno fend
2905 *
2906 * Can be trimmed to:
2907 * +-------+ OR +-------+
2908 * fbno fend fbno fend
2909 *
2910 * Backward allocation leads to significant
2911 * fragmentation of directories, which degrades
2912 * directory performance, therefore we always want to
2913 * choose the option that produces forward allocation
2914 * patterns.
2915 * Preferring the lower bno extent will make the next
2916 * request use "fend" as the start of the next
2917 * allocation; if the segment is no longer busy at
2918 * that point, we'll get a contiguous allocation, but
2919 * even if it is still busy, we will get a forward
2920 * allocation.
2921 * We try to avoid choosing the segment at "bend",
2922 * because that can lead to the next allocation
2923 * taking the segment at "fbno", which would be a
2924 * backward allocation. We only use the segment at
2925 * "fbno" if it is much larger than the current
2926 * requested size, because in that case there's a
2927 * good chance subsequent allocations will be
2928 * contiguous.
2929 */
2930 if (bbno - fbno >= args->maxlen) {
2931 /* left candidate fits perfect */
2932 fend = bbno;
2933 } else if (fend - bend >= args->maxlen * 4) {
2934 /* right candidate has enough free space */
2935 fbno = bend;
2936 } else if (bbno - fbno >= args->minlen) {
2937 /* left candidate fits minimum requirement */
2938 fend = bbno;
2939 } else {
2940 goto fail;
2941 }
2942 }
2943
2944 flen = fend - fbno;
2945 }
2946 spin_unlock(&args->pag->pagb_lock);
2947
2948 if (fbno != bno || flen != len) {
2949 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len,
2950 fbno, flen);
2951 }
2952 *rbno = fbno;
2953 *rlen = flen;
2954 return;
2955fail:
2956 /*
2957 * Return a zero extent length as a failure indication. All callers
2958 * re-check if the trimmed extent satisfies the minlen requirement.
2959 */
2960 spin_unlock(&args->pag->pagb_lock);
2961 trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
2962 *rbno = fbno;
2963 *rlen = 0;
2964}
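
A small worked example of the middle-overlap heuristic at the end of xfs_alloc_busy_trim(): when a busy range splits the free extent into a left piece [fbno, bbno) and a right piece [bend, fend), the left piece wins if it reaches maxlen, the right piece only if it is at least four times maxlen, and otherwise the left piece is still taken as long as it meets minlen. A minimal sketch of just that decision, with illustrative names rather than the kernel code:

#include <stdbool.h>
#include <stdint.h>

/*
 * Choose between the pieces of a free extent [fbno, fend) split by a busy
 * range [bbno, bend). Returns false when neither piece is acceptable and
 * the caller has to force the log instead.
 */
static bool pick_unbusy_piece(uint32_t fbno, uint32_t fend,
			      uint32_t bbno, uint32_t bend,
			      uint32_t minlen, uint32_t maxlen,
			      uint32_t *rbno, uint32_t *rend)
{
	if (bbno - fbno >= maxlen) {		/* left piece fits perfectly */
		*rbno = fbno; *rend = bbno;
	} else if (fend - bend >= maxlen * 4) {	/* right piece is much larger */
		*rbno = bend; *rend = fend;
	} else if (bbno - fbno >= minlen) {	/* left piece meets the minimum */
		*rbno = fbno; *rend = bbno;
	} else {
		return false;
	}
	return true;
}

For instance, with fbno = 0, fend = 200, a busy range of [40, 60), minlen = 8 and maxlen = 64, the left piece is only 40 blocks and the right piece 140 blocks (less than 4 * 64), so the 40-block left piece is chosen because it still satisfies minlen and keeps the allocation pattern moving forward.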
2965
2966static void
2967xfs_alloc_busy_clear_one(
2968 struct xfs_mount *mp,
2969 struct xfs_perag *pag,
2970 struct xfs_busy_extent *busyp)
2971{
2972 if (busyp->length) {
2973 trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno,
2974 busyp->length);
2975 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2976 }
2977
2978 list_del_init(&busyp->list);
2697 kmem_free(busyp); 2979 kmem_free(busyp);
2698} 2980}
2981
2982void
2983xfs_alloc_busy_clear(
2984 struct xfs_mount *mp,
2985 struct list_head *list)
2986{
2987 struct xfs_busy_extent *busyp, *n;
2988 struct xfs_perag *pag = NULL;
2989 xfs_agnumber_t agno = NULLAGNUMBER;
2990
2991 list_for_each_entry_safe(busyp, n, list, list) {
2992 if (busyp->agno != agno) {
2993 if (pag) {
2994 spin_unlock(&pag->pagb_lock);
2995 xfs_perag_put(pag);
2996 }
2997 pag = xfs_perag_get(mp, busyp->agno);
2998 spin_lock(&pag->pagb_lock);
2999 agno = busyp->agno;
3000 }
3001
3002 xfs_alloc_busy_clear_one(mp, pag, busyp);
3003 }
3004
3005 if (pag) {
3006 spin_unlock(&pag->pagb_lock);
3007 xfs_perag_put(pag);
3008 }
3009}
3010
3011/*
3012 * Callback for list_sort to sort busy extents by the AG they reside in.
3013 */
3014int
3015xfs_busy_extent_ag_cmp(
3016 void *priv,
3017 struct list_head *a,
3018 struct list_head *b)
3019{
3020 return container_of(a, struct xfs_busy_extent, list)->agno -
3021 container_of(b, struct xfs_busy_extent, list)->agno;
3022}