diff options
Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r-- | fs/xfs/xfs_alloc.c | 844 |
1 files changed, 584 insertions, 260 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 27d64d752eab..acdced86413c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
@@ -41,19 +41,13 @@ | |||
41 | #define XFSA_FIXUP_BNO_OK 1 | 41 | #define XFSA_FIXUP_BNO_OK 1 |
42 | #define XFSA_FIXUP_CNT_OK 2 | 42 | #define XFSA_FIXUP_CNT_OK 2 |
43 | 43 | ||
44 | /* | ||
45 | * Prototypes for per-ag allocation routines | ||
46 | */ | ||
47 | |||
48 | STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); | 44 | STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); |
49 | STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); | 45 | STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); |
50 | STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); | 46 | STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); |
51 | STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, | 47 | STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, |
52 | xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); | 48 | xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); |
53 | 49 | STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, | |
54 | /* | 50 | xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); |
55 | * Internal functions. | ||
56 | */ | ||
57 | 51 | ||
58 | /* | 52 | /* |
59 | * Lookup the record equal to [bno, len] in the btree given by cur. | 53 | * Lookup the record equal to [bno, len] in the btree given by cur. |
@@ -154,19 +148,21 @@ xfs_alloc_compute_aligned( | |||
154 | xfs_extlen_t *reslen) /* result length */ | 148 | xfs_extlen_t *reslen) /* result length */ |
155 | { | 149 | { |
156 | xfs_agblock_t bno; | 150 | xfs_agblock_t bno; |
157 | xfs_extlen_t diff; | ||
158 | xfs_extlen_t len; | 151 | xfs_extlen_t len; |
159 | 152 | ||
160 | if (args->alignment > 1 && foundlen >= args->minlen) { | 153 | /* Trim busy sections out of found extent */ |
161 | bno = roundup(foundbno, args->alignment); | 154 | xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); |
162 | diff = bno - foundbno; | 155 | |
163 | len = diff >= foundlen ? 0 : foundlen - diff; | 156 | if (args->alignment > 1 && len >= args->minlen) { |
157 | xfs_agblock_t aligned_bno = roundup(bno, args->alignment); | ||
158 | xfs_extlen_t diff = aligned_bno - bno; | ||
159 | |||
160 | *resbno = aligned_bno; | ||
161 | *reslen = diff >= len ? 0 : len - diff; | ||
164 | } else { | 162 | } else { |
165 | bno = foundbno; | 163 | *resbno = bno; |
166 | len = foundlen; | 164 | *reslen = len; |
167 | } | 165 | } |
168 | *resbno = bno; | ||
169 | *reslen = len; | ||
170 | } | 166 | } |
171 | 167 | ||
172 | /* | 168 | /* |
@@ -280,7 +276,6 @@ xfs_alloc_fix_minleft( | |||
280 | return 1; | 276 | return 1; |
281 | agf = XFS_BUF_TO_AGF(args->agbp); | 277 | agf = XFS_BUF_TO_AGF(args->agbp); |
282 | diff = be32_to_cpu(agf->agf_freeblks) | 278 | diff = be32_to_cpu(agf->agf_freeblks) |
283 | + be32_to_cpu(agf->agf_flcount) | ||
284 | - args->len - args->minleft; | 279 | - args->len - args->minleft; |
285 | if (diff >= 0) | 280 | if (diff >= 0) |
286 | return 1; | 281 | return 1; |
@@ -541,16 +536,8 @@ xfs_alloc_ag_vextent( | |||
541 | if (error) | 536 | if (error) |
542 | return error; | 537 | return error; |
543 | 538 | ||
544 | /* | 539 | ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, |
545 | * Search the busylist for these blocks and mark the | 540 | args->agbno, args->len)); |
546 | * transaction as synchronous if blocks are found. This | ||
547 | * avoids the need to block due to a synchronous log | ||
548 | * force to ensure correct ordering as the synchronous | ||
549 | * transaction will guarantee that for us. | ||
550 | */ | ||
551 | if (xfs_alloc_busy_search(args->mp, args->agno, | ||
552 | args->agbno, args->len)) | ||
553 | xfs_trans_set_sync(args->tp); | ||
554 | } | 541 | } |
555 | 542 | ||
556 | if (!args->isfl) { | 543 | if (!args->isfl) { |
@@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact( | |||
577 | { | 564 | { |
578 | xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ | 565 | xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ |
579 | xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ | 566 | xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ |
580 | xfs_agblock_t end; /* end of allocated extent */ | ||
581 | int error; | 567 | int error; |
582 | xfs_agblock_t fbno; /* start block of found extent */ | 568 | xfs_agblock_t fbno; /* start block of found extent */ |
583 | xfs_agblock_t fend; /* end block of found extent */ | ||
584 | xfs_extlen_t flen; /* length of found extent */ | 569 | xfs_extlen_t flen; /* length of found extent */ |
570 | xfs_agblock_t tbno; /* start block of trimmed extent */ | ||
571 | xfs_extlen_t tlen; /* length of trimmed extent */ | ||
572 | xfs_agblock_t tend; /* end block of trimmed extent */ | ||
573 | xfs_agblock_t end; /* end of allocated extent */ | ||
585 | int i; /* success/failure of operation */ | 574 | int i; /* success/failure of operation */ |
586 | xfs_agblock_t maxend; /* end of maximal extent */ | ||
587 | xfs_agblock_t minend; /* end of minimal extent */ | ||
588 | xfs_extlen_t rlen; /* length of returned extent */ | 575 | xfs_extlen_t rlen; /* length of returned extent */ |
589 | 576 | ||
590 | ASSERT(args->alignment == 1); | 577 | ASSERT(args->alignment == 1); |
@@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact( | |||
614 | goto error0; | 601 | goto error0; |
615 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 602 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
616 | ASSERT(fbno <= args->agbno); | 603 | ASSERT(fbno <= args->agbno); |
617 | minend = args->agbno + args->minlen; | ||
618 | maxend = args->agbno + args->maxlen; | ||
619 | fend = fbno + flen; | ||
620 | 604 | ||
621 | /* | 605 | /* |
622 | * Give up if the freespace isn't long enough for the minimum request. | 606 | * Check for overlapping busy extents. |
607 | */ | ||
608 | xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); | ||
609 | |||
610 | /* | ||
611 | * Give up if the start of the extent is busy, or the freespace isn't | ||
612 | * long enough for the minimum request. | ||
623 | */ | 613 | */ |
624 | if (fend < minend) | 614 | if (tbno > args->agbno) |
615 | goto not_found; | ||
616 | if (tlen < args->minlen) | ||
617 | goto not_found; | ||
618 | tend = tbno + tlen; | ||
619 | if (tend < args->agbno + args->minlen) | ||
625 | goto not_found; | 620 | goto not_found; |
626 | 621 | ||
627 | /* | 622 | /* |
@@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact( | |||
630 | * | 625 | * |
631 | * Fix the length according to mod and prod if given. | 626 | * Fix the length according to mod and prod if given. |
632 | */ | 627 | */ |
633 | end = XFS_AGBLOCK_MIN(fend, maxend); | 628 | end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); |
634 | args->len = end - args->agbno; | 629 | args->len = end - args->agbno; |
635 | xfs_alloc_fix_len(args); | 630 | xfs_alloc_fix_len(args); |
636 | if (!xfs_alloc_fix_minleft(args)) | 631 | if (!xfs_alloc_fix_minleft(args)) |
637 | goto not_found; | 632 | goto not_found; |
638 | 633 | ||
639 | rlen = args->len; | 634 | rlen = args->len; |
640 | ASSERT(args->agbno + rlen <= fend); | 635 | ASSERT(args->agbno + rlen <= tend); |
641 | end = args->agbno + rlen; | 636 | end = args->agbno + rlen; |
642 | 637 | ||
643 | /* | 638 | /* |
@@ -686,11 +681,11 @@ xfs_alloc_find_best_extent( | |||
686 | struct xfs_btree_cur **scur, /* searching cursor */ | 681 | struct xfs_btree_cur **scur, /* searching cursor */ |
687 | xfs_agblock_t gdiff, /* difference for search comparison */ | 682 | xfs_agblock_t gdiff, /* difference for search comparison */ |
688 | xfs_agblock_t *sbno, /* extent found by search */ | 683 | xfs_agblock_t *sbno, /* extent found by search */ |
689 | xfs_extlen_t *slen, | 684 | xfs_extlen_t *slen, /* extent length */ |
690 | xfs_extlen_t *slena, /* aligned length */ | 685 | xfs_agblock_t *sbnoa, /* aligned extent found by search */ |
686 | xfs_extlen_t *slena, /* aligned extent length */ | ||
691 | int dir) /* 0 = search right, 1 = search left */ | 687 | int dir) /* 0 = search right, 1 = search left */ |
692 | { | 688 | { |
693 | xfs_agblock_t bno; | ||
694 | xfs_agblock_t new; | 689 | xfs_agblock_t new; |
695 | xfs_agblock_t sdiff; | 690 | xfs_agblock_t sdiff; |
696 | int error; | 691 | int error; |
@@ -708,16 +703,16 @@ xfs_alloc_find_best_extent( | |||
708 | if (error) | 703 | if (error) |
709 | goto error0; | 704 | goto error0; |
710 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 705 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
711 | xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); | 706 | xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); |
712 | 707 | ||
713 | /* | 708 | /* |
714 | * The good extent is closer than this one. | 709 | * The good extent is closer than this one. |
715 | */ | 710 | */ |
716 | if (!dir) { | 711 | if (!dir) { |
717 | if (bno >= args->agbno + gdiff) | 712 | if (*sbnoa >= args->agbno + gdiff) |
718 | goto out_use_good; | 713 | goto out_use_good; |
719 | } else { | 714 | } else { |
720 | if (bno <= args->agbno - gdiff) | 715 | if (*sbnoa <= args->agbno - gdiff) |
721 | goto out_use_good; | 716 | goto out_use_good; |
722 | } | 717 | } |
723 | 718 | ||
@@ -729,8 +724,8 @@ xfs_alloc_find_best_extent( | |||
729 | xfs_alloc_fix_len(args); | 724 | xfs_alloc_fix_len(args); |
730 | 725 | ||
731 | sdiff = xfs_alloc_compute_diff(args->agbno, args->len, | 726 | sdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
732 | args->alignment, *sbno, | 727 | args->alignment, *sbnoa, |
733 | *slen, &new); | 728 | *slena, &new); |
734 | 729 | ||
735 | /* | 730 | /* |
736 | * Choose closer size and invalidate other cursor. | 731 | * Choose closer size and invalidate other cursor. |
@@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near( | |||
780 | xfs_agblock_t gtbnoa; /* aligned ... */ | 775 | xfs_agblock_t gtbnoa; /* aligned ... */ |
781 | xfs_extlen_t gtdiff; /* difference to right side entry */ | 776 | xfs_extlen_t gtdiff; /* difference to right side entry */ |
782 | xfs_extlen_t gtlen; /* length of right side entry */ | 777 | xfs_extlen_t gtlen; /* length of right side entry */ |
783 | xfs_extlen_t gtlena = 0; /* aligned ... */ | 778 | xfs_extlen_t gtlena; /* aligned ... */ |
784 | xfs_agblock_t gtnew; /* useful start bno of right side */ | 779 | xfs_agblock_t gtnew; /* useful start bno of right side */ |
785 | int error; /* error code */ | 780 | int error; /* error code */ |
786 | int i; /* result code, temporary */ | 781 | int i; /* result code, temporary */ |
@@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near( | |||
789 | xfs_agblock_t ltbnoa; /* aligned ... */ | 784 | xfs_agblock_t ltbnoa; /* aligned ... */ |
790 | xfs_extlen_t ltdiff; /* difference to left side entry */ | 785 | xfs_extlen_t ltdiff; /* difference to left side entry */ |
791 | xfs_extlen_t ltlen; /* length of left side entry */ | 786 | xfs_extlen_t ltlen; /* length of left side entry */ |
792 | xfs_extlen_t ltlena = 0; /* aligned ... */ | 787 | xfs_extlen_t ltlena; /* aligned ... */ |
793 | xfs_agblock_t ltnew; /* useful start bno of left side */ | 788 | xfs_agblock_t ltnew; /* useful start bno of left side */ |
794 | xfs_extlen_t rlen; /* length of returned extent */ | 789 | xfs_extlen_t rlen; /* length of returned extent */ |
790 | int forced = 0; | ||
795 | #if defined(DEBUG) && defined(__KERNEL__) | 791 | #if defined(DEBUG) && defined(__KERNEL__) |
796 | /* | 792 | /* |
797 | * Randomly don't execute the first algorithm. | 793 | * Randomly don't execute the first algorithm. |
@@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near( | |||
800 | 796 | ||
801 | dofirst = random32() & 1; | 797 | dofirst = random32() & 1; |
802 | #endif | 798 | #endif |
799 | |||
800 | restart: | ||
801 | bno_cur_lt = NULL; | ||
802 | bno_cur_gt = NULL; | ||
803 | ltlen = 0; | ||
804 | gtlena = 0; | ||
805 | ltlena = 0; | ||
806 | |||
803 | /* | 807 | /* |
804 | * Get a cursor for the by-size btree. | 808 | * Get a cursor for the by-size btree. |
805 | */ | 809 | */ |
806 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, | 810 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, |
807 | args->agno, XFS_BTNUM_CNT); | 811 | args->agno, XFS_BTNUM_CNT); |
808 | ltlen = 0; | 812 | |
809 | bno_cur_lt = bno_cur_gt = NULL; | ||
810 | /* | 813 | /* |
811 | * See if there are any free extents as big as maxlen. | 814 | * See if there are any free extents as big as maxlen. |
812 | */ | 815 | */ |
@@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near( | |||
822 | goto error0; | 825 | goto error0; |
823 | if (i == 0 || ltlen == 0) { | 826 | if (i == 0 || ltlen == 0) { |
824 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 827 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
828 | trace_xfs_alloc_near_noentry(args); | ||
825 | return 0; | 829 | return 0; |
826 | } | 830 | } |
827 | ASSERT(i == 1); | 831 | ASSERT(i == 1); |
828 | } | 832 | } |
829 | args->wasfromfl = 0; | 833 | args->wasfromfl = 0; |
834 | |||
830 | /* | 835 | /* |
831 | * First algorithm. | 836 | * First algorithm. |
832 | * If the requested extent is large wrt the freespaces available | 837 | * If the requested extent is large wrt the freespaces available |
@@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near( | |||
890 | if (args->len < blen) | 895 | if (args->len < blen) |
891 | continue; | 896 | continue; |
892 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, | 897 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
893 | args->alignment, ltbno, ltlen, <new); | 898 | args->alignment, ltbnoa, ltlena, <new); |
894 | if (ltnew != NULLAGBLOCK && | 899 | if (ltnew != NULLAGBLOCK && |
895 | (args->len > blen || ltdiff < bdiff)) { | 900 | (args->len > blen || ltdiff < bdiff)) { |
896 | bdiff = ltdiff; | 901 | bdiff = ltdiff; |
@@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near( | |||
1042 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); | 1047 | args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); |
1043 | xfs_alloc_fix_len(args); | 1048 | xfs_alloc_fix_len(args); |
1044 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, | 1049 | ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
1045 | args->alignment, ltbno, ltlen, <new); | 1050 | args->alignment, ltbnoa, ltlena, <new); |
1046 | 1051 | ||
1047 | error = xfs_alloc_find_best_extent(args, | 1052 | error = xfs_alloc_find_best_extent(args, |
1048 | &bno_cur_lt, &bno_cur_gt, | 1053 | &bno_cur_lt, &bno_cur_gt, |
1049 | ltdiff, >bno, >len, >lena, | 1054 | ltdiff, >bno, >len, |
1055 | >bnoa, >lena, | ||
1050 | 0 /* search right */); | 1056 | 0 /* search right */); |
1051 | } else { | 1057 | } else { |
1052 | ASSERT(gtlena >= args->minlen); | 1058 | ASSERT(gtlena >= args->minlen); |
@@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near( | |||
1057 | args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); | 1063 | args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); |
1058 | xfs_alloc_fix_len(args); | 1064 | xfs_alloc_fix_len(args); |
1059 | gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, | 1065 | gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, |
1060 | args->alignment, gtbno, gtlen, >new); | 1066 | args->alignment, gtbnoa, gtlena, >new); |
1061 | 1067 | ||
1062 | error = xfs_alloc_find_best_extent(args, | 1068 | error = xfs_alloc_find_best_extent(args, |
1063 | &bno_cur_gt, &bno_cur_lt, | 1069 | &bno_cur_gt, &bno_cur_lt, |
1064 | gtdiff, <bno, <len, <lena, | 1070 | gtdiff, <bno, <len, |
1071 | <bnoa, <lena, | ||
1065 | 1 /* search left */); | 1072 | 1 /* search left */); |
1066 | } | 1073 | } |
1067 | 1074 | ||
@@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near( | |||
1073 | * If we couldn't get anything, give up. | 1080 | * If we couldn't get anything, give up. |
1074 | */ | 1081 | */ |
1075 | if (bno_cur_lt == NULL && bno_cur_gt == NULL) { | 1082 | if (bno_cur_lt == NULL && bno_cur_gt == NULL) { |
1083 | if (!forced++) { | ||
1084 | trace_xfs_alloc_near_busy(args); | ||
1085 | xfs_log_force(args->mp, XFS_LOG_SYNC); | ||
1086 | goto restart; | ||
1087 | } | ||
1088 | |||
1076 | trace_xfs_alloc_size_neither(args); | 1089 | trace_xfs_alloc_size_neither(args); |
1077 | args->agbno = NULLAGBLOCK; | 1090 | args->agbno = NULLAGBLOCK; |
1078 | return 0; | 1091 | return 0; |
@@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near( | |||
1107 | return 0; | 1120 | return 0; |
1108 | } | 1121 | } |
1109 | rlen = args->len; | 1122 | rlen = args->len; |
1110 | (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, | 1123 | (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, |
1111 | ltlen, <new); | 1124 | ltbnoa, ltlena, <new); |
1112 | ASSERT(ltnew >= ltbno); | 1125 | ASSERT(ltnew >= ltbno); |
1113 | ASSERT(ltnew + rlen <= ltbno + ltlen); | 1126 | ASSERT(ltnew + rlen <= ltbnoa + ltlena); |
1114 | ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); | 1127 | ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); |
1115 | args->agbno = ltnew; | 1128 | args->agbno = ltnew; |
1129 | |||
1116 | if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, | 1130 | if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, |
1117 | ltnew, rlen, XFSA_FIXUP_BNO_OK))) | 1131 | ltnew, rlen, XFSA_FIXUP_BNO_OK))) |
1118 | goto error0; | 1132 | goto error0; |
@@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size( | |||
1155 | int i; /* temp status variable */ | 1169 | int i; /* temp status variable */ |
1156 | xfs_agblock_t rbno; /* returned block number */ | 1170 | xfs_agblock_t rbno; /* returned block number */ |
1157 | xfs_extlen_t rlen; /* length of returned extent */ | 1171 | xfs_extlen_t rlen; /* length of returned extent */ |
1172 | int forced = 0; | ||
1158 | 1173 | ||
1174 | restart: | ||
1159 | /* | 1175 | /* |
1160 | * Allocate and initialize a cursor for the by-size btree. | 1176 | * Allocate and initialize a cursor for the by-size btree. |
1161 | */ | 1177 | */ |
1162 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, | 1178 | cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, |
1163 | args->agno, XFS_BTNUM_CNT); | 1179 | args->agno, XFS_BTNUM_CNT); |
1164 | bno_cur = NULL; | 1180 | bno_cur = NULL; |
1181 | |||
1165 | /* | 1182 | /* |
1166 | * Look for an entry >= maxlen+alignment-1 blocks. | 1183 | * Look for an entry >= maxlen+alignment-1 blocks. |
1167 | */ | 1184 | */ |
1168 | if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, | 1185 | if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, |
1169 | args->maxlen + args->alignment - 1, &i))) | 1186 | args->maxlen + args->alignment - 1, &i))) |
1170 | goto error0; | 1187 | goto error0; |
1188 | |||
1171 | /* | 1189 | /* |
1172 | * If none, then pick up the last entry in the tree unless the | 1190 | * If none or we have busy extents that we cannot allocate from, then |
1173 | * tree is empty. | 1191 | * we have to settle for a smaller extent. In the case that there are |
1192 | * no large extents, this will return the last entry in the tree unless | ||
1193 | * the tree is empty. In the case that there are only busy large | ||
1194 | * extents, this will return the largest small extent unless there | ||
1195 | * are no smaller extents available. | ||
1174 | */ | 1196 | */ |
1175 | if (!i) { | 1197 | if (!i || forced > 1) { |
1176 | if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, | 1198 | error = xfs_alloc_ag_vextent_small(args, cnt_cur, |
1177 | &flen, &i))) | 1199 | &fbno, &flen, &i); |
1200 | if (error) | ||
1178 | goto error0; | 1201 | goto error0; |
1179 | if (i == 0 || flen == 0) { | 1202 | if (i == 0 || flen == 0) { |
1180 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 1203 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
@@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size( | |||
1182 | return 0; | 1205 | return 0; |
1183 | } | 1206 | } |
1184 | ASSERT(i == 1); | 1207 | ASSERT(i == 1); |
1208 | xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); | ||
1209 | } else { | ||
1210 | /* | ||
1211 | * Search for a non-busy extent that is large enough. | ||
1212 | * If we are at low space, don't check, or if we fall of | ||
1213 | * the end of the btree, turn off the busy check and | ||
1214 | * restart. | ||
1215 | */ | ||
1216 | for (;;) { | ||
1217 | error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); | ||
1218 | if (error) | ||
1219 | goto error0; | ||
1220 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
1221 | |||
1222 | xfs_alloc_compute_aligned(args, fbno, flen, | ||
1223 | &rbno, &rlen); | ||
1224 | |||
1225 | if (rlen >= args->maxlen) | ||
1226 | break; | ||
1227 | |||
1228 | error = xfs_btree_increment(cnt_cur, 0, &i); | ||
1229 | if (error) | ||
1230 | goto error0; | ||
1231 | if (i == 0) { | ||
1232 | /* | ||
1233 | * Our only valid extents must have been busy. | ||
1234 | * Make it unbusy by forcing the log out and | ||
1235 | * retrying. If we've been here before, forcing | ||
1236 | * the log isn't making the extents available, | ||
1237 | * which means they have probably been freed in | ||
1238 | * this transaction. In that case, we have to | ||
1239 | * give up on them and we'll attempt a minlen | ||
1240 | * allocation the next time around. | ||
1241 | */ | ||
1242 | xfs_btree_del_cursor(cnt_cur, | ||
1243 | XFS_BTREE_NOERROR); | ||
1244 | trace_xfs_alloc_size_busy(args); | ||
1245 | if (!forced++) | ||
1246 | xfs_log_force(args->mp, XFS_LOG_SYNC); | ||
1247 | goto restart; | ||
1248 | } | ||
1249 | } | ||
1185 | } | 1250 | } |
1186 | /* | 1251 | |
1187 | * There's a freespace as big as maxlen+alignment-1, get it. | ||
1188 | */ | ||
1189 | else { | ||
1190 | if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) | ||
1191 | goto error0; | ||
1192 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
1193 | } | ||
1194 | /* | 1252 | /* |
1195 | * In the first case above, we got the last entry in the | 1253 | * In the first case above, we got the last entry in the |
1196 | * by-size btree. Now we check to see if the space hits maxlen | 1254 | * by-size btree. Now we check to see if the space hits maxlen |
1197 | * once aligned; if not, we search left for something better. | 1255 | * once aligned; if not, we search left for something better. |
1198 | * This can't happen in the second case above. | 1256 | * This can't happen in the second case above. |
1199 | */ | 1257 | */ |
1200 | xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); | ||
1201 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); | 1258 | rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); |
1202 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || | 1259 | XFS_WANT_CORRUPTED_GOTO(rlen == 0 || |
1203 | (rlen <= flen && rbno + rlen <= fbno + flen), error0); | 1260 | (rlen <= flen && rbno + rlen <= fbno + flen), error0); |
@@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size( | |||
1251 | * Fix up the length. | 1308 | * Fix up the length. |
1252 | */ | 1309 | */ |
1253 | args->len = rlen; | 1310 | args->len = rlen; |
1254 | xfs_alloc_fix_len(args); | 1311 | if (rlen < args->minlen) { |
1255 | if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { | 1312 | if (!forced++) { |
1256 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | 1313 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); |
1257 | trace_xfs_alloc_size_nominleft(args); | 1314 | trace_xfs_alloc_size_busy(args); |
1258 | args->agbno = NULLAGBLOCK; | 1315 | xfs_log_force(args->mp, XFS_LOG_SYNC); |
1259 | return 0; | 1316 | goto restart; |
1317 | } | ||
1318 | goto out_nominleft; | ||
1260 | } | 1319 | } |
1320 | xfs_alloc_fix_len(args); | ||
1321 | |||
1322 | if (!xfs_alloc_fix_minleft(args)) | ||
1323 | goto out_nominleft; | ||
1261 | rlen = args->len; | 1324 | rlen = args->len; |
1262 | XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); | 1325 | XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); |
1263 | /* | 1326 | /* |
@@ -1287,6 +1350,12 @@ error0: | |||
1287 | if (bno_cur) | 1350 | if (bno_cur) |
1288 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); | 1351 | xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); |
1289 | return error; | 1352 | return error; |
1353 | |||
1354 | out_nominleft: | ||
1355 | xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); | ||
1356 | trace_xfs_alloc_size_nominleft(args); | ||
1357 | args->agbno = NULLAGBLOCK; | ||
1358 | return 0; | ||
1290 | } | 1359 | } |
1291 | 1360 | ||
1292 | /* | 1361 | /* |
@@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small( | |||
1326 | if (error) | 1395 | if (error) |
1327 | goto error0; | 1396 | goto error0; |
1328 | if (fbno != NULLAGBLOCK) { | 1397 | if (fbno != NULLAGBLOCK) { |
1398 | xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, | ||
1399 | args->userdata); | ||
1400 | |||
1329 | if (args->userdata) { | 1401 | if (args->userdata) { |
1330 | xfs_buf_t *bp; | 1402 | xfs_buf_t *bp; |
1331 | 1403 | ||
@@ -1617,18 +1689,6 @@ xfs_free_ag_extent( | |||
1617 | 1689 | ||
1618 | trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); | 1690 | trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); |
1619 | 1691 | ||
1620 | /* | ||
1621 | * Since blocks move to the free list without the coordination | ||
1622 | * used in xfs_bmap_finish, we can't allow block to be available | ||
1623 | * for reallocation and non-transaction writing (user data) | ||
1624 | * until we know that the transaction that moved it to the free | ||
1625 | * list is permanently on disk. We track the blocks by declaring | ||
1626 | * these blocks as "busy"; the busy list is maintained on a per-ag | ||
1627 | * basis and each transaction records which entries should be removed | ||
1628 | * when the iclog commits to disk. If a busy block is allocated, | ||
1629 | * the iclog is pushed up to the LSN that freed the block. | ||
1630 | */ | ||
1631 | xfs_alloc_busy_insert(tp, agno, bno, len); | ||
1632 | return 0; | 1692 | return 0; |
1633 | 1693 | ||
1634 | error0: | 1694 | error0: |
@@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist( | |||
1923 | xfs_alloc_log_agf(tp, agbp, logflags); | 1983 | xfs_alloc_log_agf(tp, agbp, logflags); |
1924 | *bnop = bno; | 1984 | *bnop = bno; |
1925 | 1985 | ||
1926 | /* | ||
1927 | * As blocks are freed, they are added to the per-ag busy list and | ||
1928 | * remain there until the freeing transaction is committed to disk. | ||
1929 | * Now that we have allocated blocks, this list must be searched to see | ||
1930 | * if a block is being reused. If one is, then the freeing transaction | ||
1931 | * must be pushed to disk before this transaction. | ||
1932 | * | ||
1933 | * We do this by setting the current transaction to a sync transaction | ||
1934 | * which guarantees that the freeing transaction is on disk before this | ||
1935 | * transaction. This is done instead of a synchronous log force here so | ||
1936 | * that we don't sit and wait with the AGF locked in the transaction | ||
1937 | * during the log force. | ||
1938 | */ | ||
1939 | if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) | ||
1940 | xfs_trans_set_sync(tp); | ||
1941 | return 0; | 1986 | return 0; |
1942 | } | 1987 | } |
1943 | 1988 | ||
@@ -2423,105 +2468,13 @@ xfs_free_extent( | |||
2423 | } | 2468 | } |
2424 | 2469 | ||
2425 | error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); | 2470 | error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); |
2471 | if (!error) | ||
2472 | xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); | ||
2426 | error0: | 2473 | error0: |
2427 | xfs_perag_put(args.pag); | 2474 | xfs_perag_put(args.pag); |
2428 | return error; | 2475 | return error; |
2429 | } | 2476 | } |
2430 | 2477 | ||
2431 | |||
2432 | /* | ||
2433 | * AG Busy list management | ||
2434 | * The busy list contains block ranges that have been freed but whose | ||
2435 | * transactions have not yet hit disk. If any block listed in a busy | ||
2436 | * list is reused, the transaction that freed it must be forced to disk | ||
2437 | * before continuing to use the block. | ||
2438 | * | ||
2439 | * xfs_alloc_busy_insert - add to the per-ag busy list | ||
2440 | * xfs_alloc_busy_clear - remove an item from the per-ag busy list | ||
2441 | * xfs_alloc_busy_search - search for a busy extent | ||
2442 | */ | ||
2443 | |||
2444 | /* | ||
2445 | * Insert a new extent into the busy tree. | ||
2446 | * | ||
2447 | * The busy extent tree is indexed by the start block of the busy extent. | ||
2448 | * there can be multiple overlapping ranges in the busy extent tree but only | ||
2449 | * ever one entry at a given start block. The reason for this is that | ||
2450 | * multi-block extents can be freed, then smaller chunks of that extent | ||
2451 | * allocated and freed again before the first transaction commit is on disk. | ||
2452 | * If the exact same start block is freed a second time, we have to wait for | ||
2453 | * that busy extent to pass out of the tree before the new extent is inserted. | ||
2454 | * There are two main cases we have to handle here. | ||
2455 | * | ||
2456 | * The first case is a transaction that triggers a "free - allocate - free" | ||
2457 | * cycle. This can occur during btree manipulations as a btree block is freed | ||
2458 | * to the freelist, then allocated from the free list, then freed again. In | ||
2459 | * this case, the second extxpnet free is what triggers the duplicate and as | ||
2460 | * such the transaction IDs should match. Because the extent was allocated in | ||
2461 | * this transaction, the transaction must be marked as synchronous. This is | ||
2462 | * true for all cases where the free/alloc/free occurs in the one transaction, | ||
2463 | * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. | ||
2464 | * This serves to catch violations of the second case quite effectively. | ||
2465 | * | ||
2466 | * The second case is where the free/alloc/free occur in different | ||
2467 | * transactions. In this case, the thread freeing the extent the second time | ||
2468 | * can't mark the extent busy immediately because it is already tracked in a | ||
2469 | * transaction that may be committing. When the log commit for the existing | ||
2470 | * busy extent completes, the busy extent will be removed from the tree. If we | ||
2471 | * allow the second busy insert to continue using that busy extent structure, | ||
2472 | * it can be freed before this transaction is safely in the log. Hence our | ||
2473 | * only option in this case is to force the log to remove the existing busy | ||
2474 | * extent from the list before we insert the new one with the current | ||
2475 | * transaction ID. | ||
2476 | * | ||
2477 | * The problem we are trying to avoid in the free-alloc-free in separate | ||
2478 | * transactions is most easily described with a timeline: | ||
2479 | * | ||
2480 | * Thread 1 Thread 2 Thread 3 xfslogd | ||
2481 | * xact alloc | ||
2482 | * free X | ||
2483 | * mark busy | ||
2484 | * commit xact | ||
2485 | * free xact | ||
2486 | * xact alloc | ||
2487 | * alloc X | ||
2488 | * busy search | ||
2489 | * mark xact sync | ||
2490 | * commit xact | ||
2491 | * free xact | ||
2492 | * force log | ||
2493 | * checkpoint starts | ||
2494 | * .... | ||
2495 | * xact alloc | ||
2496 | * free X | ||
2497 | * mark busy | ||
2498 | * finds match | ||
2499 | * *** KABOOM! *** | ||
2500 | * .... | ||
2501 | * log IO completes | ||
2502 | * unbusy X | ||
2503 | * checkpoint completes | ||
2504 | * | ||
2505 | * By issuing a log force in thread 3 @ "KABOOM", the thread will block until | ||
2506 | * the checkpoint completes, and the busy extent it matched will have been | ||
2507 | * removed from the tree when it is woken. Hence it can then continue safely. | ||
2508 | * | ||
2509 | * However, to ensure this matching process is robust, we need to use the | ||
2510 | * transaction ID for identifying transaction, as delayed logging results in | ||
2511 | * the busy extent and transaction lifecycles being different. i.e. the busy | ||
2512 | * extent is active for a lot longer than the transaction. Hence the | ||
2513 | * transaction structure can be freed and reallocated, then mark the same | ||
2514 | * extent busy again in the new transaction. In this case the new transaction | ||
2515 | * will have a different tid but can have the same address, and hence we need | ||
2516 | * to check against the tid. | ||
2517 | * | ||
2518 | * Future: for delayed logging, we could avoid the log force if the extent was | ||
2519 | * first freed in the current checkpoint sequence. This, however, requires the | ||
2520 | * ability to pin the current checkpoint in memory until this transaction | ||
2521 | * commits to ensure that both the original free and the current one combine | ||
2522 | * logically into the one checkpoint. If the checkpoint sequences are | ||
2523 | * different, however, we still need to wait on a log force. | ||
2524 | */ | ||
2525 | void | 2478 | void |
2526 | xfs_alloc_busy_insert( | 2479 | xfs_alloc_busy_insert( |
2527 | struct xfs_trans *tp, | 2480 | struct xfs_trans *tp, |
@@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert( | |||
2533 | struct xfs_busy_extent *busyp; | 2486 | struct xfs_busy_extent *busyp; |
2534 | struct xfs_perag *pag; | 2487 | struct xfs_perag *pag; |
2535 | struct rb_node **rbp; | 2488 | struct rb_node **rbp; |
2536 | struct rb_node *parent; | 2489 | struct rb_node *parent = NULL; |
2537 | int match; | ||
2538 | |||
2539 | 2490 | ||
2540 | new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); | 2491 | new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); |
2541 | if (!new) { | 2492 | if (!new) { |
@@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert( | |||
2544 | * block, make this a synchronous transaction to insure that | 2495 | * block, make this a synchronous transaction to insure that |
2545 | * the block is not reused before this transaction commits. | 2496 | * the block is not reused before this transaction commits. |
2546 | */ | 2497 | */ |
2547 | trace_xfs_alloc_busy(tp, agno, bno, len, 1); | 2498 | trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); |
2548 | xfs_trans_set_sync(tp); | 2499 | xfs_trans_set_sync(tp); |
2549 | return; | 2500 | return; |
2550 | } | 2501 | } |
@@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert( | |||
2552 | new->agno = agno; | 2503 | new->agno = agno; |
2553 | new->bno = bno; | 2504 | new->bno = bno; |
2554 | new->length = len; | 2505 | new->length = len; |
2555 | new->tid = xfs_log_get_trans_ident(tp); | ||
2556 | |||
2557 | INIT_LIST_HEAD(&new->list); | 2506 | INIT_LIST_HEAD(&new->list); |
2558 | 2507 | ||
2559 | /* trace before insert to be able to see failed inserts */ | 2508 | /* trace before insert to be able to see failed inserts */ |
2560 | trace_xfs_alloc_busy(tp, agno, bno, len, 0); | 2509 | trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); |
2561 | 2510 | ||
2562 | pag = xfs_perag_get(tp->t_mountp, new->agno); | 2511 | pag = xfs_perag_get(tp->t_mountp, new->agno); |
2563 | restart: | ||
2564 | spin_lock(&pag->pagb_lock); | 2512 | spin_lock(&pag->pagb_lock); |
2565 | rbp = &pag->pagb_tree.rb_node; | 2513 | rbp = &pag->pagb_tree.rb_node; |
2566 | parent = NULL; | 2514 | while (*rbp) { |
2567 | busyp = NULL; | ||
2568 | match = 0; | ||
2569 | while (*rbp && match >= 0) { | ||
2570 | parent = *rbp; | 2515 | parent = *rbp; |
2571 | busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); | 2516 | busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); |
2572 | 2517 | ||
2573 | if (new->bno < busyp->bno) { | 2518 | if (new->bno < busyp->bno) { |
2574 | /* may overlap, but exact start block is lower */ | ||
2575 | rbp = &(*rbp)->rb_left; | 2519 | rbp = &(*rbp)->rb_left; |
2576 | if (new->bno + new->length > busyp->bno) | 2520 | ASSERT(new->bno + new->length <= busyp->bno); |
2577 | match = busyp->tid == new->tid ? 1 : -1; | ||
2578 | } else if (new->bno > busyp->bno) { | 2521 | } else if (new->bno > busyp->bno) { |
2579 | /* may overlap, but exact start block is higher */ | ||
2580 | rbp = &(*rbp)->rb_right; | 2522 | rbp = &(*rbp)->rb_right; |
2581 | if (bno < busyp->bno + busyp->length) | 2523 | ASSERT(bno >= busyp->bno + busyp->length); |
2582 | match = busyp->tid == new->tid ? 1 : -1; | ||
2583 | } else { | 2524 | } else { |
2584 | match = busyp->tid == new->tid ? 1 : -1; | 2525 | ASSERT(0); |
2585 | break; | ||
2586 | } | 2526 | } |
2587 | } | 2527 | } |
2588 | if (match < 0) { | ||
2589 | /* overlap marked busy in different transaction */ | ||
2590 | spin_unlock(&pag->pagb_lock); | ||
2591 | xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); | ||
2592 | goto restart; | ||
2593 | } | ||
2594 | if (match > 0) { | ||
2595 | /* | ||
2596 | * overlap marked busy in same transaction. Update if exact | ||
2597 | * start block match, otherwise combine the busy extents into | ||
2598 | * a single range. | ||
2599 | */ | ||
2600 | if (busyp->bno == new->bno) { | ||
2601 | busyp->length = max(busyp->length, new->length); | ||
2602 | spin_unlock(&pag->pagb_lock); | ||
2603 | ASSERT(tp->t_flags & XFS_TRANS_SYNC); | ||
2604 | xfs_perag_put(pag); | ||
2605 | kmem_free(new); | ||
2606 | return; | ||
2607 | } | ||
2608 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2609 | new->length = max(busyp->bno + busyp->length, | ||
2610 | new->bno + new->length) - | ||
2611 | min(busyp->bno, new->bno); | ||
2612 | new->bno = min(busyp->bno, new->bno); | ||
2613 | } else | ||
2614 | busyp = NULL; | ||
2615 | 2528 | ||
2616 | rb_link_node(&new->rb_node, parent, rbp); | 2529 | rb_link_node(&new->rb_node, parent, rbp); |
2617 | rb_insert_color(&new->rb_node, &pag->pagb_tree); | 2530 | rb_insert_color(&new->rb_node, &pag->pagb_tree); |
@@ -2619,7 +2532,6 @@ restart: | |||
2619 | list_add(&new->list, &tp->t_busy); | 2532 | list_add(&new->list, &tp->t_busy); |
2620 | spin_unlock(&pag->pagb_lock); | 2533 | spin_unlock(&pag->pagb_lock); |
2621 | xfs_perag_put(pag); | 2534 | xfs_perag_put(pag); |
2622 | kmem_free(busyp); | ||
2623 | } | 2535 | } |
2624 | 2536 | ||
2625 | /* | 2537 | /* |
@@ -2668,31 +2580,443 @@ xfs_alloc_busy_search( | |||
2668 | } | 2580 | } |
2669 | } | 2581 | } |
2670 | spin_unlock(&pag->pagb_lock); | 2582 | spin_unlock(&pag->pagb_lock); |
2671 | trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); | ||
2672 | xfs_perag_put(pag); | 2583 | xfs_perag_put(pag); |
2673 | return match; | 2584 | return match; |
2674 | } | 2585 | } |
2675 | 2586 | ||
2587 | /* | ||
2588 | * The found free extent [fbno, fend] overlaps part or all of the given busy | ||
2589 | * extent. If the overlap covers the beginning, the end, or all of the busy | ||
2590 | * extent, the overlapping portion can be made unbusy and used for the | ||
2591 | * allocation. We can't split a busy extent because we can't modify a | ||
2592 | * transaction/CIL context busy list, but we can update an entries block | ||
2593 | * number or length. | ||
2594 | * | ||
2595 | * Returns true if the extent can safely be reused, or false if the search | ||
2596 | * needs to be restarted. | ||
2597 | */ | ||
2598 | STATIC bool | ||
2599 | xfs_alloc_busy_update_extent( | ||
2600 | struct xfs_mount *mp, | ||
2601 | struct xfs_perag *pag, | ||
2602 | struct xfs_busy_extent *busyp, | ||
2603 | xfs_agblock_t fbno, | ||
2604 | xfs_extlen_t flen, | ||
2605 | bool userdata) | ||
2606 | { | ||
2607 | xfs_agblock_t fend = fbno + flen; | ||
2608 | xfs_agblock_t bbno = busyp->bno; | ||
2609 | xfs_agblock_t bend = bbno + busyp->length; | ||
2610 | |||
2611 | /* | ||
2612 | * If there is a busy extent overlapping a user allocation, we have | ||
2613 | * no choice but to force the log and retry the search. | ||
2614 | * | ||
2615 | * Fortunately this does not happen during normal operation, but | ||
2616 | * only if the filesystem is very low on space and has to dip into | ||
2617 | * the AGFL for normal allocations. | ||
2618 | */ | ||
2619 | if (userdata) | ||
2620 | goto out_force_log; | ||
2621 | |||
2622 | if (bbno < fbno && bend > fend) { | ||
2623 | /* | ||
2624 | * Case 1: | ||
2625 | * bbno bend | ||
2626 | * +BBBBBBBBBBBBBBBBB+ | ||
2627 | * +---------+ | ||
2628 | * fbno fend | ||
2629 | */ | ||
2630 | |||
2631 | /* | ||
2632 | * We would have to split the busy extent to be able to track | ||
2633 | * it correct, which we cannot do because we would have to | ||
2634 | * modify the list of busy extents attached to the transaction | ||
2635 | * or CIL context, which is immutable. | ||
2636 | * | ||
2637 | * Force out the log to clear the busy extent and retry the | ||
2638 | * search. | ||
2639 | */ | ||
2640 | goto out_force_log; | ||
2641 | } else if (bbno >= fbno && bend <= fend) { | ||
2642 | /* | ||
2643 | * Case 2: | ||
2644 | * bbno bend | ||
2645 | * +BBBBBBBBBBBBBBBBB+ | ||
2646 | * +-----------------+ | ||
2647 | * fbno fend | ||
2648 | * | ||
2649 | * Case 3: | ||
2650 | * bbno bend | ||
2651 | * +BBBBBBBBBBBBBBBBB+ | ||
2652 | * +--------------------------+ | ||
2653 | * fbno fend | ||
2654 | * | ||
2655 | * Case 4: | ||
2656 | * bbno bend | ||
2657 | * +BBBBBBBBBBBBBBBBB+ | ||
2658 | * +--------------------------+ | ||
2659 | * fbno fend | ||
2660 | * | ||
2661 | * Case 5: | ||
2662 | * bbno bend | ||
2663 | * +BBBBBBBBBBBBBBBBB+ | ||
2664 | * +-----------------------------------+ | ||
2665 | * fbno fend | ||
2666 | * | ||
2667 | */ | ||
2668 | |||
2669 | /* | ||
2670 | * The busy extent is fully covered by the extent we are | ||
2671 | * allocating, and can simply be removed from the rbtree. | ||
2672 | * However we cannot remove it from the immutable list | ||
2673 | * tracking busy extents in the transaction or CIL context, | ||
2674 | * so set the length to zero to mark it invalid. | ||
2675 | * | ||
2676 | * We also need to restart the busy extent search from the | ||
2677 | * tree root, because erasing the node can rearrange the | ||
2678 | * tree topology. | ||
2679 | */ | ||
2680 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2681 | busyp->length = 0; | ||
2682 | return false; | ||
2683 | } else if (fend < bend) { | ||
2684 | /* | ||
2685 | * Case 6: | ||
2686 | * bbno bend | ||
2687 | * +BBBBBBBBBBBBBBBBB+ | ||
2688 | * +---------+ | ||
2689 | * fbno fend | ||
2690 | * | ||
2691 | * Case 7: | ||
2692 | * bbno bend | ||
2693 | * +BBBBBBBBBBBBBBBBB+ | ||
2694 | * +------------------+ | ||
2695 | * fbno fend | ||
2696 | * | ||
2697 | */ | ||
2698 | busyp->bno = fend; | ||
2699 | } else if (bbno < fbno) { | ||
2700 | /* | ||
2701 | * Case 8: | ||
2702 | * bbno bend | ||
2703 | * +BBBBBBBBBBBBBBBBB+ | ||
2704 | * +-------------+ | ||
2705 | * fbno fend | ||
2706 | * | ||
2707 | * Case 9: | ||
2708 | * bbno bend | ||
2709 | * +BBBBBBBBBBBBBBBBB+ | ||
2710 | * +----------------------+ | ||
2711 | * fbno fend | ||
2712 | */ | ||
2713 | busyp->length = fbno - busyp->bno; | ||
2714 | } else { | ||
2715 | ASSERT(0); | ||
2716 | } | ||
2717 | |||
2718 | trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); | ||
2719 | return true; | ||
2720 | |||
2721 | out_force_log: | ||
2722 | spin_unlock(&pag->pagb_lock); | ||
2723 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
2724 | trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); | ||
2725 | spin_lock(&pag->pagb_lock); | ||
2726 | return false; | ||
2727 | } | ||
2728 | |||
2729 | |||
2730 | /* | ||
2731 | * For a given extent [fbno, flen], make sure we can reuse it safely. | ||
2732 | */ | ||
2676 | void | 2733 | void |
2677 | xfs_alloc_busy_clear( | 2734 | xfs_alloc_busy_reuse( |
2678 | struct xfs_mount *mp, | 2735 | struct xfs_mount *mp, |
2679 | struct xfs_busy_extent *busyp) | 2736 | xfs_agnumber_t agno, |
2737 | xfs_agblock_t fbno, | ||
2738 | xfs_extlen_t flen, | ||
2739 | bool userdata) | ||
2680 | { | 2740 | { |
2681 | struct xfs_perag *pag; | 2741 | struct xfs_perag *pag; |
2742 | struct rb_node *rbp; | ||
2682 | 2743 | ||
2683 | trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, | 2744 | ASSERT(flen > 0); |
2684 | busyp->length); | ||
2685 | 2745 | ||
2686 | ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, | 2746 | pag = xfs_perag_get(mp, agno); |
2687 | busyp->length) == 1); | 2747 | spin_lock(&pag->pagb_lock); |
2748 | restart: | ||
2749 | rbp = pag->pagb_tree.rb_node; | ||
2750 | while (rbp) { | ||
2751 | struct xfs_busy_extent *busyp = | ||
2752 | rb_entry(rbp, struct xfs_busy_extent, rb_node); | ||
2753 | xfs_agblock_t bbno = busyp->bno; | ||
2754 | xfs_agblock_t bend = bbno + busyp->length; | ||
2688 | 2755 | ||
2689 | list_del_init(&busyp->list); | 2756 | if (fbno + flen <= bbno) { |
2757 | rbp = rbp->rb_left; | ||
2758 | continue; | ||
2759 | } else if (fbno >= bend) { | ||
2760 | rbp = rbp->rb_right; | ||
2761 | continue; | ||
2762 | } | ||
2690 | 2763 | ||
2691 | pag = xfs_perag_get(mp, busyp->agno); | 2764 | if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, |
2692 | spin_lock(&pag->pagb_lock); | 2765 | userdata)) |
2693 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | 2766 | goto restart; |
2767 | } | ||
2694 | spin_unlock(&pag->pagb_lock); | 2768 | spin_unlock(&pag->pagb_lock); |
2695 | xfs_perag_put(pag); | 2769 | xfs_perag_put(pag); |
2770 | } | ||
2771 | |||
2772 | /* | ||
2773 | * For a given extent [fbno, flen], search the busy extent list to find a | ||
2774 | * subset of the extent that is not busy. If *rlen is smaller than | ||
2775 | * args->minlen no suitable extent could be found, and the higher level | ||
2776 | * code needs to force out the log and retry the allocation. | ||
2777 | */ | ||
2778 | STATIC void | ||
2779 | xfs_alloc_busy_trim( | ||
2780 | struct xfs_alloc_arg *args, | ||
2781 | xfs_agblock_t bno, | ||
2782 | xfs_extlen_t len, | ||
2783 | xfs_agblock_t *rbno, | ||
2784 | xfs_extlen_t *rlen) | ||
2785 | { | ||
2786 | xfs_agblock_t fbno; | ||
2787 | xfs_extlen_t flen; | ||
2788 | struct rb_node *rbp; | ||
2789 | |||
2790 | ASSERT(len > 0); | ||
2696 | 2791 | ||
2792 | spin_lock(&args->pag->pagb_lock); | ||
2793 | restart: | ||
2794 | fbno = bno; | ||
2795 | flen = len; | ||
2796 | rbp = args->pag->pagb_tree.rb_node; | ||
2797 | while (rbp && flen >= args->minlen) { | ||
2798 | struct xfs_busy_extent *busyp = | ||
2799 | rb_entry(rbp, struct xfs_busy_extent, rb_node); | ||
2800 | xfs_agblock_t fend = fbno + flen; | ||
2801 | xfs_agblock_t bbno = busyp->bno; | ||
2802 | xfs_agblock_t bend = bbno + busyp->length; | ||
2803 | |||
2804 | if (fend <= bbno) { | ||
2805 | rbp = rbp->rb_left; | ||
2806 | continue; | ||
2807 | } else if (fbno >= bend) { | ||
2808 | rbp = rbp->rb_right; | ||
2809 | continue; | ||
2810 | } | ||
2811 | |||
2812 | /* | ||
2813 | * If this is a metadata allocation, try to reuse the busy | ||
2814 | * extent instead of trimming the allocation. | ||
2815 | */ | ||
2816 | if (!args->userdata) { | ||
2817 | if (!xfs_alloc_busy_update_extent(args->mp, args->pag, | ||
2818 | busyp, fbno, flen, | ||
2819 | false)) | ||
2820 | goto restart; | ||
2821 | continue; | ||
2822 | } | ||
2823 | |||
2824 | if (bbno <= fbno) { | ||
2825 | /* start overlap */ | ||
2826 | |||
2827 | /* | ||
2828 | * Case 1: | ||
2829 | * bbno bend | ||
2830 | * +BBBBBBBBBBBBBBBBB+ | ||
2831 | * +---------+ | ||
2832 | * fbno fend | ||
2833 | * | ||
2834 | * Case 2: | ||
2835 | * bbno bend | ||
2836 | * +BBBBBBBBBBBBBBBBB+ | ||
2837 | * +-------------+ | ||
2838 | * fbno fend | ||
2839 | * | ||
2840 | * Case 3: | ||
2841 | * bbno bend | ||
2842 | * +BBBBBBBBBBBBBBBBB+ | ||
2843 | * +-------------+ | ||
2844 | * fbno fend | ||
2845 | * | ||
2846 | * Case 4: | ||
2847 | * bbno bend | ||
2848 | * +BBBBBBBBBBBBBBBBB+ | ||
2849 | * +-----------------+ | ||
2850 | * fbno fend | ||
2851 | * | ||
2852 | * No unbusy region in extent, return failure. | ||
2853 | */ | ||
2854 | if (fend <= bend) | ||
2855 | goto fail; | ||
2856 | |||
2857 | /* | ||
2858 | * Case 5: | ||
2859 | * bbno bend | ||
2860 | * +BBBBBBBBBBBBBBBBB+ | ||
2861 | * +----------------------+ | ||
2862 | * fbno fend | ||
2863 | * | ||
2864 | * Case 6: | ||
2865 | * bbno bend | ||
2866 | * +BBBBBBBBBBBBBBBBB+ | ||
2867 | * +--------------------------+ | ||
2868 | * fbno fend | ||
2869 | * | ||
2870 | * Needs to be trimmed to: | ||
2871 | * +-------+ | ||
2872 | * fbno fend | ||
2873 | */ | ||
2874 | fbno = bend; | ||
2875 | } else if (bend >= fend) { | ||
2876 | /* end overlap */ | ||
2877 | |||
2878 | /* | ||
2879 | * Case 7: | ||
2880 | * bbno bend | ||
2881 | * +BBBBBBBBBBBBBBBBB+ | ||
2882 | * +------------------+ | ||
2883 | * fbno fend | ||
2884 | * | ||
2885 | * Case 8: | ||
2886 | * bbno bend | ||
2887 | * +BBBBBBBBBBBBBBBBB+ | ||
2888 | * +--------------------------+ | ||
2889 | * fbno fend | ||
2890 | * | ||
2891 | * Needs to be trimmed to: | ||
2892 | * +-------+ | ||
2893 | * fbno fend | ||
2894 | */ | ||
2895 | fend = bbno; | ||
2896 | } else { | ||
2897 | /* middle overlap */ | ||
2898 | |||
2899 | /* | ||
2900 | * Case 9: | ||
2901 | * bbno bend | ||
2902 | * +BBBBBBBBBBBBBBBBB+ | ||
2903 | * +-----------------------------------+ | ||
2904 | * fbno fend | ||
2905 | * | ||
2906 | * Can be trimmed to: | ||
2907 | * +-------+ OR +-------+ | ||
2908 | * fbno fend fbno fend | ||
2909 | * | ||
2910 | * Backward allocation leads to significant | ||
2911 | * fragmentation of directories, which degrades | ||
2912 | * directory performance, therefore we always want to | ||
2913 | * choose the option that produces forward allocation | ||
2914 | * patterns. | ||
2915 | * Preferring the lower bno extent will make the next | ||
2916 | * request use "fend" as the start of the next | ||
2917 | * allocation; if the segment is no longer busy at | ||
2918 | * that point, we'll get a contiguous allocation, but | ||
2919 | * even if it is still busy, we will get a forward | ||
2920 | * allocation. | ||
2921 | * We try to avoid choosing the segment at "bend", | ||
2922 | * because that can lead to the next allocation | ||
2923 | * taking the segment at "fbno", which would be a | ||
2924 | * backward allocation. We only use the segment at | ||
2925 | * "fbno" if it is much larger than the current | ||
2926 | * requested size, because in that case there's a | ||
2927 | * good chance subsequent allocations will be | ||
2928 | * contiguous. | ||
2929 | */ | ||
2930 | if (bbno - fbno >= args->maxlen) { | ||
2931 | /* left candidate fits perfect */ | ||
2932 | fend = bbno; | ||
2933 | } else if (fend - bend >= args->maxlen * 4) { | ||
2934 | /* right candidate has enough free space */ | ||
2935 | fbno = bend; | ||
2936 | } else if (bbno - fbno >= args->minlen) { | ||
2937 | /* left candidate fits minimum requirement */ | ||
2938 | fend = bbno; | ||
2939 | } else { | ||
2940 | goto fail; | ||
2941 | } | ||
2942 | } | ||
2943 | |||
2944 | flen = fend - fbno; | ||
2945 | } | ||
2946 | spin_unlock(&args->pag->pagb_lock); | ||
2947 | |||
2948 | if (fbno != bno || flen != len) { | ||
2949 | trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, | ||
2950 | fbno, flen); | ||
2951 | } | ||
2952 | *rbno = fbno; | ||
2953 | *rlen = flen; | ||
2954 | return; | ||
2955 | fail: | ||
2956 | /* | ||
2957 | * Return a zero extent length as failure indications. All callers | ||
2958 | * re-check if the trimmed extent satisfies the minlen requirement. | ||
2959 | */ | ||
2960 | spin_unlock(&args->pag->pagb_lock); | ||
2961 | trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); | ||
2962 | *rbno = fbno; | ||
2963 | *rlen = 0; | ||
2964 | } | ||
2965 | |||
2966 | static void | ||
2967 | xfs_alloc_busy_clear_one( | ||
2968 | struct xfs_mount *mp, | ||
2969 | struct xfs_perag *pag, | ||
2970 | struct xfs_busy_extent *busyp) | ||
2971 | { | ||
2972 | if (busyp->length) { | ||
2973 | trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, | ||
2974 | busyp->length); | ||
2975 | rb_erase(&busyp->rb_node, &pag->pagb_tree); | ||
2976 | } | ||
2977 | |||
2978 | list_del_init(&busyp->list); | ||
2697 | kmem_free(busyp); | 2979 | kmem_free(busyp); |
2698 | } | 2980 | } |
2981 | |||
2982 | void | ||
2983 | xfs_alloc_busy_clear( | ||
2984 | struct xfs_mount *mp, | ||
2985 | struct list_head *list) | ||
2986 | { | ||
2987 | struct xfs_busy_extent *busyp, *n; | ||
2988 | struct xfs_perag *pag = NULL; | ||
2989 | xfs_agnumber_t agno = NULLAGNUMBER; | ||
2990 | |||
2991 | list_for_each_entry_safe(busyp, n, list, list) { | ||
2992 | if (busyp->agno != agno) { | ||
2993 | if (pag) { | ||
2994 | spin_unlock(&pag->pagb_lock); | ||
2995 | xfs_perag_put(pag); | ||
2996 | } | ||
2997 | pag = xfs_perag_get(mp, busyp->agno); | ||
2998 | spin_lock(&pag->pagb_lock); | ||
2999 | agno = busyp->agno; | ||
3000 | } | ||
3001 | |||
3002 | xfs_alloc_busy_clear_one(mp, pag, busyp); | ||
3003 | } | ||
3004 | |||
3005 | if (pag) { | ||
3006 | spin_unlock(&pag->pagb_lock); | ||
3007 | xfs_perag_put(pag); | ||
3008 | } | ||
3009 | } | ||
3010 | |||
3011 | /* | ||
3012 | * Callback for list_sort to sort busy extents by the AG they reside in. | ||
3013 | */ | ||
3014 | int | ||
3015 | xfs_busy_extent_ag_cmp( | ||
3016 | void *priv, | ||
3017 | struct list_head *a, | ||
3018 | struct list_head *b) | ||
3019 | { | ||
3020 | return container_of(a, struct xfs_busy_extent, list)->agno - | ||
3021 | container_of(b, struct xfs_busy_extent, list)->agno; | ||
3022 | } | ||