path: root/fs/xfs/xfs_inode.c
author		David Chinner <dgc@sgi.com>	2007-08-28 00:00:13 -0400
committer	Tim Shimmin <tes@chook.melbourne.sgi.com>	2007-10-15 02:50:50 -0400
commit		da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb (patch)
tree		84454023d649df67cc6b125c73746ddb341ac34e /fs/xfs/xfs_inode.c
parent		39cd9f877e63ce7e02cdc7f5dbf1b908451c9532 (diff)
[XFS] Radix tree based inode caching
One of the perpetual scaling problems XFS has is indexing its incore inodes. We currently use hashes, and the default hash sizes chosen can only ever be a tradeoff between memory consumption and the maximum realistic size of the cache. As a result, anyone who has millions of inodes cached on a filesystem needs to tune the size of the cache via the ihashsize mount option to allow decent scalability with inode cache operations.

A further problem is the separate inode cluster hash, whose size is based on the ihashsize but is smaller, so under certain conditions (sparse cluster cache population) this can become a limitation long before the inode hash is causing issues.

The following patchset removes the inode hash and cluster hash and replaces them with radix trees to avoid the scalability limitations of the hashes. It also reduces the size of the inodes by 3 pointers.

SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
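To illustrate the new indexing scheme: each allocation group now carries a radix tree (pag_ici_root, protected by pag_ici_lock) keyed by the AG-relative inode number, so a cache lookup becomes a single radix tree probe instead of a hash-chain walk. Below is a minimal sketch of that pattern using only names visible in the diff that follows; the wrapper function itself is hypothetical and omits the inode referencing a real caller would need.

/*
 * Hypothetical sketch: look up an incore inode through the per-AG
 * radix tree, the same pattern xfs_ifree_cluster() uses below.
 */
STATIC xfs_inode_t *
xfs_ici_lookup_sketch(
	xfs_mount_t	*mp,
	xfs_ino_t	ino)
{
	xfs_perag_t	*pag = xfs_get_perag(mp, ino);	/* pin the AG */
	xfs_inode_t	*ip;

	read_lock(&pag->pag_ici_lock);
	/* the tree key is the AG-relative inode number, not the global one */
	ip = radix_tree_lookup(&pag->pag_ici_root,
			       XFS_INO_TO_AGINO(mp, ino));
	read_unlock(&pag->pag_ici_lock);
	xfs_put_perag(mp, pag);

	return ip;
}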
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--	fs/xfs/xfs_inode.c	42
1 file changed, 19 insertions(+), 23 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41a0c73b601a..c1b917bd5951 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -52,7 +52,7 @@
 
 kmem_zone_t	*xfs_ifork_zone;
 kmem_zone_t	*xfs_inode_zone;
-kmem_zone_t	*xfs_chashlist_zone;
+kmem_zone_t	*xfs_icluster_zone;
 
 /*
  * Used in xfs_itruncate().  This is the maximum number of extents
@@ -2182,10 +2182,10 @@ xfs_ifree_cluster(
 	int			i, j, found, pre_flushed;
 	xfs_daddr_t		blkno;
 	xfs_buf_t		*bp;
-	xfs_ihash_t		*ih;
 	xfs_inode_t		*ip, **ip_found;
 	xfs_inode_log_item_t	*iip;
 	xfs_log_item_t		*lip;
+	xfs_perag_t		*pag = xfs_get_perag(mp, inum);
 	SPLDECL(s);
 
 	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
@@ -2220,23 +2220,20 @@ xfs_ifree_cluster(
 	 */
 	found = 0;
 	for (i = 0; i < ninodes; i++) {
-		ih = XFS_IHASH(mp, inum + i);
-		read_lock(&ih->ih_lock);
-		for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
-			if (ip->i_ino == inum + i)
-				break;
-		}
+		read_lock(&pag->pag_ici_lock);
+		ip = radix_tree_lookup(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(mp, (inum + i)));
 
 		/* Inode not in memory or we found it already,
 		 * nothing to do
 		 */
 		if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-			read_unlock(&ih->ih_lock);
+			read_unlock(&pag->pag_ici_lock);
 			continue;
 		}
 
 		if (xfs_inode_clean(ip)) {
-			read_unlock(&ih->ih_lock);
+			read_unlock(&pag->pag_ici_lock);
 			continue;
 		}
 
@@ -2259,7 +2256,7 @@ xfs_ifree_cluster(
 					ip_found[found++] = ip;
 				}
 			}
-			read_unlock(&ih->ih_lock);
+			read_unlock(&pag->pag_ici_lock);
 			continue;
 		}
 
@@ -2277,8 +2274,7 @@ xfs_ifree_cluster(
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			}
 		}
-
-		read_unlock(&ih->ih_lock);
+		read_unlock(&pag->pag_ici_lock);
 	}
 
 	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
@@ -2333,6 +2329,7 @@ xfs_ifree_cluster(
 	}
 
 	kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
+	xfs_put_perag(mp, pag);
 }
 
 /*
@@ -3050,12 +3047,11 @@ xfs_iflush(
 	xfs_mount_t		*mp;
 	int			error;
 	/* REFERENCED */
-	xfs_chash_t		*ch;
 	xfs_inode_t		*iq;
 	int			clcount;	/* count of inodes clustered */
 	int			bufwasdelwri;
+	struct hlist_node	*entry;
 	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
-	SPLDECL(s);
 
 	XFS_STATS_INC(xs_iflush_count);
 
@@ -3169,14 +3165,14 @@ xfs_iflush(
 	 * inode clustering:
 	 * see if other inodes can be gathered into this write
 	 */
-
-	ip->i_chash->chl_buf = bp;
-
-	ch = XFS_CHASH(mp, ip->i_blkno);
-	s = mutex_spinlock(&ch->ch_lock);
+	spin_lock(&ip->i_cluster->icl_lock);
+	ip->i_cluster->icl_buf = bp;
 
 	clcount = 0;
-	for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
+	hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
+		if (iq == ip)
+			continue;
+
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing.  These checks will be repeated
@@ -3227,7 +3223,7 @@ xfs_iflush(
 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
 		}
 	}
-	mutex_spinunlock(&ch->ch_lock, s);
+	spin_unlock(&ip->i_cluster->icl_lock);
 
 	if (clcount) {
 		XFS_STATS_INC(xs_icluster_flushcnt);
@@ -3264,7 +3260,7 @@ cluster_corrupt_out:
 	/* Corruption detected in the clustering loop.  Invalidate the
 	 * inode buffer and shut down the filesystem.
 	 */
-	mutex_spinunlock(&ch->ch_lock, s);
+	spin_unlock(&ip->i_cluster->icl_lock);
 
 	/*
 	 * Clean up the buffer.  If it was B_DELWRI, just release it --