aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_alloc.c
diff options
context:
space:
mode:
authorDavid Chinner <dgc@sgi.com>2007-05-24 01:26:31 -0400
committerTim Shimmin <tes@chook.melbourne.sgi.com>2007-07-14 01:28:50 -0400
commit92821e2ba4ae26887223326fb0b95cdab963b768 (patch)
treea40a2ef10e5b0791df3e522f3139193d39bf2454 /fs/xfs/xfs_alloc.c
parent3260f78ad6d5b788e78ea709d377f58e569bee41 (diff)
[XFS] Lazy Superblock Counters
When we have a couple of hundred transactions on the fly at once, they all typically modify the on disk superblock in some way. create/unlink/mkdir/rmdir modify inode counts, allocation/freeing modify free block counts. When these counts are modified in a transaction, they must eventually lock the superblock buffer and apply the mods. The buffer then remains locked until the transaction is committed into the incore log buffer. The result of this is that with enough transactions on the fly the incore superblock buffer becomes a bottleneck. The result of contention on the incore superblock buffer is that transaction rates fall - the more pressure that is put on the superblock buffer, the slower things go. The key to removing the contention is to not require the superblock fields in question to be locked. We do that by not marking the superblock dirty in the transaction. IOWs, we modify the incore superblock but do not modify the cached superblock buffer. In short, we do not log superblock modifications to critical fields in the superblock on every transaction. In fact we only do it just before we write the superblock to disk every sync period or just before unmount. This creates an interesting problem - if we don't log or write out the fields in every transaction, then how do the values get recovered after a crash? The answer is simple - we keep enough duplicate, logged information in other structures that we can reconstruct the correct count after log recovery has been performed. It is the AGF and AGI structures that contain the duplicate information; after recovery, we walk every AGI and AGF and sum their individual counters to get the correct value, and we do a transaction into the log to correct them. An optimisation of this is that if we have a clean unmount record, we know the value in the superblock is correct, so we can avoid the summation walk under normal conditions and so mount/recovery times do not change under normal operation.
One wrinkle that was discovered during development was that the blocks used in the freespace btrees are never accounted for in the AGF counters. This was once a valid optimisation to make; when the filesystem is full, the free space btrees are empty and consume no space. Hence when it matters, the "accounting" is correct. But that means that when we do the AGF summations, we would not have a correct count and xfs_check would complain. Hence a new counter was added to track the number of blocks used by the free space btrees. This is an *on-disk format change*. As a result of this, lazy superblock counters are a mkfs option and at the moment on Linux there is no way to convert an old filesystem. This is possible - xfs_db can be used to twiddle the right bits and then xfs_repair will do the format conversion for you. Similarly, you can convert backwards as well. At some point we'll add functionality to xfs_admin to do the bit twiddling easily.... SGI-PV: 964999 SGI-Modid: xfs-linux-melb:xfs-kern:28652a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Christoph Hellwig <hch@infradead.org> Signed-off-by: Tim Shimmin <tes@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r--fs/xfs/xfs_alloc.c48
1 files changed, 38 insertions, 10 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8e9a40aa0cd3..98f95d4c4bcc 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1447,7 +1447,8 @@ xfs_alloc_ag_vextent_small(
1447 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && 1447 else if (args->minlen == 1 && args->alignment == 1 && !args->isfl &&
1448 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) 1448 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount)
1449 > args->minleft)) { 1449 > args->minleft)) {
1450 if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) 1450 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1451 if (error)
1451 goto error0; 1452 goto error0;
1452 if (fbno != NULLAGBLOCK) { 1453 if (fbno != NULLAGBLOCK) {
1453 if (args->userdata) { 1454 if (args->userdata) {
@@ -1923,7 +1924,8 @@ xfs_alloc_fix_freelist(
1923 while (be32_to_cpu(agf->agf_flcount) > need) { 1924 while (be32_to_cpu(agf->agf_flcount) > need) {
1924 xfs_buf_t *bp; 1925 xfs_buf_t *bp;
1925 1926
1926 if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) 1927 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
1928 if (error)
1927 return error; 1929 return error;
1928 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) 1930 if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1)))
1929 return error; 1931 return error;
@@ -1973,8 +1975,9 @@ xfs_alloc_fix_freelist(
1973 * Put each allocated block on the list. 1975 * Put each allocated block on the list.
1974 */ 1976 */
1975 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 1977 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
1976 if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, 1978 error = xfs_alloc_put_freelist(tp, agbp,
1977 bno))) 1979 agflbp, bno, 0);
1980 if (error)
1978 return error; 1981 return error;
1979 } 1982 }
1980 } 1983 }
@@ -1991,13 +1994,15 @@ int /* error */
1991xfs_alloc_get_freelist( 1994xfs_alloc_get_freelist(
1992 xfs_trans_t *tp, /* transaction pointer */ 1995 xfs_trans_t *tp, /* transaction pointer */
1993 xfs_buf_t *agbp, /* buffer containing the agf structure */ 1996 xfs_buf_t *agbp, /* buffer containing the agf structure */
1994 xfs_agblock_t *bnop) /* block address retrieved from freelist */ 1997 xfs_agblock_t *bnop, /* block address retrieved from freelist */
1998 int btreeblk) /* destination is a AGF btree */
1995{ 1999{
1996 xfs_agf_t *agf; /* a.g. freespace structure */ 2000 xfs_agf_t *agf; /* a.g. freespace structure */
1997 xfs_agfl_t *agfl; /* a.g. freelist structure */ 2001 xfs_agfl_t *agfl; /* a.g. freelist structure */
1998 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ 2002 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
1999 xfs_agblock_t bno; /* block number returned */ 2003 xfs_agblock_t bno; /* block number returned */
2000 int error; 2004 int error;
2005 int logflags;
2001#ifdef XFS_ALLOC_TRACE 2006#ifdef XFS_ALLOC_TRACE
2002 static char fname[] = "xfs_alloc_get_freelist"; 2007 static char fname[] = "xfs_alloc_get_freelist";
2003#endif 2008#endif
@@ -2032,8 +2037,16 @@ xfs_alloc_get_freelist(
2032 be32_add(&agf->agf_flcount, -1); 2037 be32_add(&agf->agf_flcount, -1);
2033 xfs_trans_agflist_delta(tp, -1); 2038 xfs_trans_agflist_delta(tp, -1);
2034 pag->pagf_flcount--; 2039 pag->pagf_flcount--;
2035 TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); 2040
2036 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); 2041 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2042 if (btreeblk) {
2043 be32_add(&agf->agf_btreeblks, 1);
2044 pag->pagf_btreeblks++;
2045 logflags |= XFS_AGF_BTREEBLKS;
2046 }
2047
2048 TRACE_MODAGF(NULL, agf, logflags);
2049 xfs_alloc_log_agf(tp, agbp, logflags);
2037 *bnop = bno; 2050 *bnop = bno;
2038 2051
2039 /* 2052 /*
@@ -2071,6 +2084,7 @@ xfs_alloc_log_agf(
2071 offsetof(xfs_agf_t, agf_flcount), 2084 offsetof(xfs_agf_t, agf_flcount),
2072 offsetof(xfs_agf_t, agf_freeblks), 2085 offsetof(xfs_agf_t, agf_freeblks),
2073 offsetof(xfs_agf_t, agf_longest), 2086 offsetof(xfs_agf_t, agf_longest),
2087 offsetof(xfs_agf_t, agf_btreeblks),
2074 sizeof(xfs_agf_t) 2088 sizeof(xfs_agf_t)
2075 }; 2089 };
2076 2090
@@ -2106,12 +2120,14 @@ xfs_alloc_put_freelist(
2106 xfs_trans_t *tp, /* transaction pointer */ 2120 xfs_trans_t *tp, /* transaction pointer */
2107 xfs_buf_t *agbp, /* buffer for a.g. freelist header */ 2121 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2108 xfs_buf_t *agflbp,/* buffer for a.g. free block array */ 2122 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2109 xfs_agblock_t bno) /* block being freed */ 2123 xfs_agblock_t bno, /* block being freed */
2124 int btreeblk) /* block came from a AGF btree */
2110{ 2125{
2111 xfs_agf_t *agf; /* a.g. freespace structure */ 2126 xfs_agf_t *agf; /* a.g. freespace structure */
2112 xfs_agfl_t *agfl; /* a.g. free block array */ 2127 xfs_agfl_t *agfl; /* a.g. free block array */
2113 __be32 *blockp;/* pointer to array entry */ 2128 __be32 *blockp;/* pointer to array entry */
2114 int error; 2129 int error;
2130 int logflags;
2115#ifdef XFS_ALLOC_TRACE 2131#ifdef XFS_ALLOC_TRACE
2116 static char fname[] = "xfs_alloc_put_freelist"; 2132 static char fname[] = "xfs_alloc_put_freelist";
2117#endif 2133#endif
@@ -2132,11 +2148,22 @@ xfs_alloc_put_freelist(
2132 be32_add(&agf->agf_flcount, 1); 2148 be32_add(&agf->agf_flcount, 1);
2133 xfs_trans_agflist_delta(tp, 1); 2149 xfs_trans_agflist_delta(tp, 1);
2134 pag->pagf_flcount++; 2150 pag->pagf_flcount++;
2151
2152 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2153 if (btreeblk) {
2154 be32_add(&agf->agf_btreeblks, -1);
2155 pag->pagf_btreeblks--;
2156 logflags |= XFS_AGF_BTREEBLKS;
2157 }
2158
2159 TRACE_MODAGF(NULL, agf, logflags);
2160 xfs_alloc_log_agf(tp, agbp, logflags);
2161
2135 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); 2162 ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp));
2136 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2163 blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)];
2137 *blockp = cpu_to_be32(bno); 2164 *blockp = cpu_to_be32(bno);
2138 TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2165 TRACE_MODAGF(NULL, agf, logflags);
2139 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); 2166 xfs_alloc_log_agf(tp, agbp, logflags);
2140 xfs_trans_log_buf(tp, agflbp, 2167 xfs_trans_log_buf(tp, agflbp,
2141 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), 2168 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl),
2142 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + 2169 (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl +
@@ -2196,6 +2223,7 @@ xfs_alloc_read_agf(
2196 pag = &mp->m_perag[agno]; 2223 pag = &mp->m_perag[agno];
2197 if (!pag->pagf_init) { 2224 if (!pag->pagf_init) {
2198 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 2225 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
2226 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
2199 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 2227 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
2200 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 2228 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2201 pag->pagf_levels[XFS_BTNUM_BNOi] = 2229 pag->pagf_levels[XFS_BTNUM_BNOi] =