path: root/fs/xfs/xfs_iget.c
author     David Chinner <dgc@sgi.com>                2007-08-28 00:00:13 -0400
committer  Tim Shimmin <tes@chook.melbourne.sgi.com>  2007-10-15 02:50:50 -0400
commit     da353b0d64e070ae7c5342a0d56ec20ae9ef5cfb (patch)
tree       84454023d649df67cc6b125c73746ddb341ac34e /fs/xfs/xfs_iget.c
parent     39cd9f877e63ce7e02cdc7f5dbf1b908451c9532 (diff)
[XFS] Radix tree based inode caching
One of the perpetual scaling problems XFS has is indexing its incore inodes. We currently use hashes, and the default hash sizes chosen can only ever be a tradeoff between memory consumption and the maximum realistic size of the cache.

As a result, anyone who has millions of inodes cached on a filesystem needs to tune the size of the cache via the ihashsize mount option to get decent scalability from inode cache operations.

A further problem is the separate inode cluster hash, whose size is based on the ihashsize but is smaller, so under certain conditions (sparse cluster cache population) it can become a limitation long before the inode hash is causing issues.

The following patchset removes the inode hash and cluster hash and replaces them with radix trees to avoid the scalability limitations of the hashes. It also reduces the size of the inodes by 3 pointers....

SGI-PV: 969561
SGI-Modid: xfs-linux-melb:xfs-kern:29481a

Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Tim Shimmin <tes@sgi.com>
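To make the new scheme concrete, here is a minimal, hypothetical sketch of the lookup side it describes: one radix tree per allocation group, indexed by the AG-relative inode number and guarded by a per-AG rwlock. This is illustrative only, not code from the patch; the my_perag, my_inode, my_perag_init and my_lookup_ino names, the MY_INEW flag and the msleep() back-off are stand-ins for the XFS-specific equivalents (xfs_perag_t, xfs_inode_t, XFS_INEW, delay()).

/* Illustrative sketch only -- simplified stand-in for the per-AG inode cache. */
#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/delay.h>

struct my_perag {
	rwlock_t		pag_ici_lock;	/* protects pag_ici_root */
	struct radix_tree_root	pag_ici_root;	/* incore inodes, indexed by agino */
};

struct my_inode {
	unsigned long	agino;		/* AG-relative inode number */
	unsigned long	flags;
};
#define MY_INEW		0x1		/* inode is still being initialised */

static void my_perag_init(struct my_perag *pag)
{
	rwlock_init(&pag->pag_ici_lock);
	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
}

/* Return the cached inode for agino, or NULL; wait out inodes still in set-up. */
static struct my_inode *my_lookup_ino(struct my_perag *pag, unsigned long agino)
{
	struct my_inode *ip;

again:
	read_lock(&pag->pag_ici_lock);
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (ip && (ip->flags & MY_INEW)) {
		/* another thread is initialising this inode: back off and retry */
		read_unlock(&pag->pag_ici_lock);
		msleep(1);
		goto again;
	}
	read_unlock(&pag->pag_ici_lock);
	return ip;
}

Splitting the index per allocation group keeps each tree (and its lock) small and naturally partitions contention, and indexing by agino keeps the key within an unsigned long regardless of filesystem size.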
Diffstat (limited to 'fs/xfs/xfs_iget.c')
-rw-r--r--  fs/xfs/xfs_iget.c | 585
1 file changed, 195 insertions(+), 390 deletions(-)
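The insertion side of the same idea, again as a hypothetical sketch reusing the my_perag/my_inode types from the sketch above rather than the patch's real xfs_perag/xfs_icluster code: preload the radix-tree node allocator before taking the per-AG write lock, then treat -EEXIST from radix_tree_insert() as "somebody else inserted this inode first, go back to the lookup". The patch below additionally retries on preload failure and attaches the inode to a per-cluster hlist at the same point.

/* Illustrative sketch only -- insertion on a cache miss, types as above. */
static int my_insert_ino(struct my_perag *pag, struct my_inode *ip)
{
	int error;

	/* allocate radix-tree nodes up front; we cannot sleep under the write lock */
	error = radix_tree_preload(GFP_KERNEL);
	if (error)
		return error;		/* the real code delays and retries instead */

	write_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, ip->agino, ip);
	if (!error)
		ip->flags |= MY_INEW;	/* keep lookups away until set-up completes */
	write_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	return error;			/* -EEXIST: lost the race, caller re-does the lookup */
}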
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 114433a22baa..e07dcc1b70a6 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -40,131 +40,13 @@
40#include "xfs_utils.h" 40#include "xfs_utils.h"
41 41
42/* 42/*
43 * Initialize the inode hash table for the newly mounted file system.
44 * Choose an initial table size based on user specified value, else
45 * use a simple algorithm using the maximum number of inodes as an
46 * indicator for table size, and clamp it between one and some large
47 * number of pages.
48 */
49void
50xfs_ihash_init(xfs_mount_t *mp)
51{
52 __uint64_t icount;
53 uint i;
54
55 if (!mp->m_ihsize) {
56 icount = mp->m_maxicount ? mp->m_maxicount :
57 (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
58 mp->m_ihsize = 1 << max_t(uint, 8,
59 (xfs_highbit64(icount) + 1) / 2);
60 mp->m_ihsize = min_t(uint, mp->m_ihsize,
61 (64 * NBPP) / sizeof(xfs_ihash_t));
62 }
63
64 mp->m_ihash = kmem_zalloc_greedy(&mp->m_ihsize,
65 NBPC * sizeof(xfs_ihash_t),
66 mp->m_ihsize * sizeof(xfs_ihash_t),
67 KM_SLEEP | KM_MAYFAIL | KM_LARGE);
68 mp->m_ihsize /= sizeof(xfs_ihash_t);
69 for (i = 0; i < mp->m_ihsize; i++)
70 rwlock_init(&(mp->m_ihash[i].ih_lock));
71}
72
73/*
74 * Free up structures allocated by xfs_ihash_init, at unmount time.
75 */
76void
77xfs_ihash_free(xfs_mount_t *mp)
78{
79 kmem_free(mp->m_ihash, mp->m_ihsize * sizeof(xfs_ihash_t));
80 mp->m_ihash = NULL;
81}
82
83/*
84 * Initialize the inode cluster hash table for the newly mounted file system.
85 * Its size is derived from the ihash table size.
86 */
87void
88xfs_chash_init(xfs_mount_t *mp)
89{
90 uint i;
91
92 mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
93 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
94 mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
95 mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
96 * sizeof(xfs_chash_t),
97 KM_SLEEP | KM_LARGE);
98 for (i = 0; i < mp->m_chsize; i++) {
99 spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
100 }
101}
102
103/*
104 * Free up structures allocated by xfs_chash_init, at unmount time.
105 */
106void
107xfs_chash_free(xfs_mount_t *mp)
108{
109 int i;
110
111 for (i = 0; i < mp->m_chsize; i++) {
112 spinlock_destroy(&mp->m_chash[i].ch_lock);
113 }
114
115 kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t));
116 mp->m_chash = NULL;
117}
118
119/*
120 * Try to move an inode to the front of its hash list if possible
121 * (and if its not there already). Called right after obtaining
122 * the list version number and then dropping the read_lock on the
123 * hash list in question (which is done right after looking up the
124 * inode in question...).
125 */
126STATIC void
127xfs_ihash_promote(
128 xfs_ihash_t *ih,
129 xfs_inode_t *ip,
130 ulong version)
131{
132 xfs_inode_t *iq;
133
134 if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) {
135 if (likely(version == ih->ih_version)) {
136 /* remove from list */
137 if ((iq = ip->i_next)) {
138 iq->i_prevp = ip->i_prevp;
139 }
140 *ip->i_prevp = iq;
141
142 /* insert at list head */
143 iq = ih->ih_next;
144 iq->i_prevp = &ip->i_next;
145 ip->i_next = iq;
146 ip->i_prevp = &ih->ih_next;
147 ih->ih_next = ip;
148 }
149 write_unlock(&ih->ih_lock);
150 }
151}
152
153/*
154 * Look up an inode by number in the given file system. 43 * Look up an inode by number in the given file system.
155 * The inode is looked up in the hash table for the file system 44 * The inode is looked up in the cache held in each AG.
156 * represented by the mount point parameter mp. Each bucket of 45 * If the inode is found in the cache, attach it to the provided
157 * the hash table is guarded by an individual semaphore. 46 * vnode.
158 *
159 * If the inode is found in the hash table, its corresponding vnode
160 * is obtained with a call to vn_get(). This call takes care of
161 * coordination with the reclamation of the inode and vnode. Note
162 * that the vmap structure is filled in while holding the hash lock.
163 * This gives us the state of the inode/vnode when we found it and
164 * is used for coordination in vn_get().
165 * 47 *
166 * If it is not in core, read it in from the file system's device and 48 * If it is not in core, read it in from the file system's device,
167 * add the inode into the hash table. 49 * add it to the cache and attach the provided vnode.
168 * 50 *
169 * The inode is locked according to the value of the lock_flags parameter. 51 * The inode is locked according to the value of the lock_flags parameter.
170 * This flag parameter indicates how and if the inode's IO lock and inode lock 52 * This flag parameter indicates how and if the inode's IO lock and inode lock
@@ -192,274 +74,241 @@ xfs_iget_core(
192 xfs_inode_t **ipp, 74 xfs_inode_t **ipp,
193 xfs_daddr_t bno) 75 xfs_daddr_t bno)
194{ 76{
195 xfs_ihash_t *ih;
196 xfs_inode_t *ip; 77 xfs_inode_t *ip;
197 xfs_inode_t *iq; 78 xfs_inode_t *iq;
198 bhv_vnode_t *inode_vp; 79 bhv_vnode_t *inode_vp;
199 ulong version;
200 int error; 80 int error;
201 /* REFERENCED */ 81 xfs_icluster_t *icl, *new_icl = NULL;
202 xfs_chash_t *ch; 82 unsigned long first_index, mask;
203 xfs_chashlist_t *chl, *chlnew; 83 xfs_perag_t *pag;
204 SPLDECL(s); 84 xfs_agino_t agino;
85
86 /* the radix tree exists only in inode capable AGs */
87 if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
88 return EINVAL;
89
90 /* get the perag structure and ensure that it's inode capable */
91 pag = xfs_get_perag(mp, ino);
92 if (!pag->pagi_inodeok)
93 return EINVAL;
94 ASSERT(pag->pag_ici_init);
95 agino = XFS_INO_TO_AGINO(mp, ino);
205 96
97again:
98 read_lock(&pag->pag_ici_lock);
99 ip = radix_tree_lookup(&pag->pag_ici_root, agino);
206 100
207 ih = XFS_IHASH(mp, ino); 101 if (ip != NULL) {
102 /*
103 * If INEW is set this inode is being set up
104 * we need to pause and try again.
105 */
106 if (xfs_iflags_test(ip, XFS_INEW)) {
107 read_unlock(&pag->pag_ici_lock);
108 delay(1);
109 XFS_STATS_INC(xs_ig_frecycle);
208 110
209again: 111 goto again;
210 read_lock(&ih->ih_lock); 112 }
211 113
212 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 114 inode_vp = XFS_ITOV_NULL(ip);
213 if (ip->i_ino == ino) { 115 if (inode_vp == NULL) {
214 /* 116 /*
215 * If INEW is set this inode is being set up 117 * If IRECLAIM is set this inode is
118 * on its way out of the system,
216 * we need to pause and try again. 119 * we need to pause and try again.
217 */ 120 */
218 if (xfs_iflags_test(ip, XFS_INEW)) { 121 if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
219 read_unlock(&ih->ih_lock); 122 read_unlock(&pag->pag_ici_lock);
220 delay(1); 123 delay(1);
221 XFS_STATS_INC(xs_ig_frecycle); 124 XFS_STATS_INC(xs_ig_frecycle);
222 125
223 goto again; 126 goto again;
224 } 127 }
128 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
225 129
226 inode_vp = XFS_ITOV_NULL(ip); 130 /*
227 if (inode_vp == NULL) { 131 * If lookup is racing with unlink, then we
228 /* 132 * should return an error immediately so we
229 * If IRECLAIM is set this inode is 133 * don't remove it from the reclaim list and
230 * on its way out of the system, 134 * potentially leak the inode.
231 * we need to pause and try again. 135 */
232 */ 136 if ((ip->i_d.di_mode == 0) &&
233 if (xfs_iflags_test(ip, XFS_IRECLAIM)) { 137 !(flags & XFS_IGET_CREATE)) {
234 read_unlock(&ih->ih_lock); 138 read_unlock(&pag->pag_ici_lock);
235 delay(1); 139 xfs_put_perag(mp, pag);
236 XFS_STATS_INC(xs_ig_frecycle); 140 return ENOENT;
237 141 }
238 goto again;
239 }
240 ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
241
242 /*
243 * If lookup is racing with unlink, then we
244 * should return an error immediately so we
245 * don't remove it from the reclaim list and
246 * potentially leak the inode.
247 */
248 if ((ip->i_d.di_mode == 0) &&
249 !(flags & XFS_IGET_CREATE)) {
250 read_unlock(&ih->ih_lock);
251 return ENOENT;
252 }
253
254 /*
255 * There may be transactions sitting in the
256 * incore log buffers or being flushed to disk
257 * at this time. We can't clear the
258 * XFS_IRECLAIMABLE flag until these
259 * transactions have hit the disk, otherwise we
260 * will void the guarantee the flag provides
261 * xfs_iunpin()
262 */
263 if (xfs_ipincount(ip)) {
264 read_unlock(&ih->ih_lock);
265 xfs_log_force(mp, 0,
266 XFS_LOG_FORCE|XFS_LOG_SYNC);
267 XFS_STATS_INC(xs_ig_frecycle);
268 goto again;
269 }
270
271 vn_trace_exit(vp, "xfs_iget.alloc",
272 (inst_t *)__return_address);
273 142
274 XFS_STATS_INC(xs_ig_found); 143 /*
144 * There may be transactions sitting in the
145 * incore log buffers or being flushed to disk
146 * at this time. We can't clear the
147 * XFS_IRECLAIMABLE flag until these
148 * transactions have hit the disk, otherwise we
149 * will void the guarantee the flag provides
150 * xfs_iunpin()
151 */
152 if (xfs_ipincount(ip)) {
153 read_unlock(&pag->pag_ici_lock);
154 xfs_log_force(mp, 0,
155 XFS_LOG_FORCE|XFS_LOG_SYNC);
156 XFS_STATS_INC(xs_ig_frecycle);
157 goto again;
158 }
275 159
276 xfs_iflags_clear(ip, XFS_IRECLAIMABLE); 160 vn_trace_exit(vp, "xfs_iget.alloc",
277 version = ih->ih_version; 161 (inst_t *)__return_address);
278 read_unlock(&ih->ih_lock);
279 xfs_ihash_promote(ih, ip, version);
280 162
281 XFS_MOUNT_ILOCK(mp); 163 XFS_STATS_INC(xs_ig_found);
282 list_del_init(&ip->i_reclaim);
283 XFS_MOUNT_IUNLOCK(mp);
284 164
285 goto finish_inode; 165 xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
166 read_unlock(&pag->pag_ici_lock);
286 167
287 } else if (vp != inode_vp) { 168 XFS_MOUNT_ILOCK(mp);
288 struct inode *inode = vn_to_inode(inode_vp); 169 list_del_init(&ip->i_reclaim);
170 XFS_MOUNT_IUNLOCK(mp);
289 171
290 /* The inode is being torn down, pause and 172 goto finish_inode;
291 * try again.
292 */
293 if (inode->i_state & (I_FREEING | I_CLEAR)) {
294 read_unlock(&ih->ih_lock);
295 delay(1);
296 XFS_STATS_INC(xs_ig_frecycle);
297 173
298 goto again; 174 } else if (vp != inode_vp) {
299 } 175 struct inode *inode = vn_to_inode(inode_vp);
300/* Chances are the other vnode (the one in the inode) is being torn
301 * down right now, and we landed on top of it. Question is, what do
302 * we do? Unhook the old inode and hook up the new one?
303 */
304 cmn_err(CE_PANIC,
305 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
306 inode_vp, vp);
307 }
308 176
309 /* 177 /* The inode is being torn down, pause and
310 * Inode cache hit: if ip is not at the front of 178 * try again.
311 * its hash chain, move it there now.
312 * Do this with the lock held for update, but
313 * do statistics after releasing the lock.
314 */ 179 */
315 version = ih->ih_version; 180 if (inode->i_state & (I_FREEING | I_CLEAR)) {
316 read_unlock(&ih->ih_lock); 181 read_unlock(&pag->pag_ici_lock);
317 xfs_ihash_promote(ih, ip, version); 182 delay(1);
318 XFS_STATS_INC(xs_ig_found); 183 XFS_STATS_INC(xs_ig_frecycle);
319 184
320finish_inode: 185 goto again;
321 if (ip->i_d.di_mode == 0) {
322 if (!(flags & XFS_IGET_CREATE))
323 return ENOENT;
324 xfs_iocore_inode_reinit(ip);
325 } 186 }
187/* Chances are the other vnode (the one in the inode) is being torn
188* down right now, and we landed on top of it. Question is, what do
189* we do? Unhook the old inode and hook up the new one?
190*/
191 cmn_err(CE_PANIC,
192 "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
193 inode_vp, vp);
194 }
326 195
327 if (lock_flags != 0) 196 /*
328 xfs_ilock(ip, lock_flags); 197 * Inode cache hit
198 */
199 read_unlock(&pag->pag_ici_lock);
200 XFS_STATS_INC(xs_ig_found);
329 201
330 xfs_iflags_clear(ip, XFS_ISTALE); 202finish_inode:
331 vn_trace_exit(vp, "xfs_iget.found", 203 if (ip->i_d.di_mode == 0) {
332 (inst_t *)__return_address); 204 if (!(flags & XFS_IGET_CREATE)) {
333 goto return_ip; 205 xfs_put_perag(mp, pag);
206 return ENOENT;
207 }
208 xfs_iocore_inode_reinit(ip);
334 } 209 }
210
211 if (lock_flags != 0)
212 xfs_ilock(ip, lock_flags);
213
214 xfs_iflags_clear(ip, XFS_ISTALE);
215 vn_trace_exit(vp, "xfs_iget.found",
216 (inst_t *)__return_address);
217 goto return_ip;
335 } 218 }
336 219
337 /* 220 /*
338 * Inode cache miss: save the hash chain version stamp and unlock 221 * Inode cache miss
339 * the chain, so we don't deadlock in vn_alloc.
340 */ 222 */
223 read_unlock(&pag->pag_ici_lock);
341 XFS_STATS_INC(xs_ig_missed); 224 XFS_STATS_INC(xs_ig_missed);
342 225
343 version = ih->ih_version;
344
345 read_unlock(&ih->ih_lock);
346
347 /* 226 /*
348 * Read the disk inode attributes into a new inode structure and get 227 * Read the disk inode attributes into a new inode structure and get
349 * a new vnode for it. This should also initialize i_ino and i_mount. 228 * a new vnode for it. This should also initialize i_ino and i_mount.
350 */ 229 */
351 error = xfs_iread(mp, tp, ino, &ip, bno, 230 error = xfs_iread(mp, tp, ino, &ip, bno,
352 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0); 231 (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
353 if (error) 232 if (error) {
233 xfs_put_perag(mp, pag);
354 return error; 234 return error;
235 }
355 236
356 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); 237 vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address);
357 238
358 xfs_inode_lock_init(ip, vp); 239 xfs_inode_lock_init(ip, vp);
359 xfs_iocore_inode_init(ip); 240 xfs_iocore_inode_init(ip);
360
361 if (lock_flags) 241 if (lock_flags)
362 xfs_ilock(ip, lock_flags); 242 xfs_ilock(ip, lock_flags);
363 243
364 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 244 if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
365 xfs_idestroy(ip); 245 xfs_idestroy(ip);
246 xfs_put_perag(mp, pag);
366 return ENOENT; 247 return ENOENT;
367 } 248 }
368 249
369 /* 250 /*
370 * Put ip on its hash chain, unless someone else hashed a duplicate 251 * This is a bit messy - we preallocate everything we _might_
371 * after we released the hash lock. 252 * need before we pick up the ici lock. That way we don't have to
253 * juggle locks and go all the way back to the start.
372 */ 254 */
373 write_lock(&ih->ih_lock); 255 new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
256 if (radix_tree_preload(GFP_KERNEL)) {
257 delay(1);
258 goto again;
259 }
260 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
261 first_index = agino & mask;
262 write_lock(&pag->pag_ici_lock);
374 263
375 if (ih->ih_version != version) { 264 /*
376 for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { 265 * Find the cluster if it exists
377 if (iq->i_ino == ino) { 266 */
378 write_unlock(&ih->ih_lock); 267 icl = NULL;
379 xfs_idestroy(ip); 268 if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
269 first_index, 1)) {
270 if ((iq->i_ino & mask) == first_index)
271 icl = iq->i_cluster;
272 }
380 273
381 XFS_STATS_INC(xs_ig_dup); 274 /*
382 goto again; 275 * insert the new inode
383 } 276 */
384 } 277 error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
278 if (unlikely(error)) {
279 BUG_ON(error != -EEXIST);
280 write_unlock(&pag->pag_ici_lock);
281 radix_tree_preload_end();
282 xfs_idestroy(ip);
283 XFS_STATS_INC(xs_ig_dup);
284 goto again;
385 } 285 }
386 286
387 /* 287 /*
388 * These values _must_ be set before releasing ihlock! 288 * These values _must_ be set before releasing ihlock!
389 */ 289 */
390 ip->i_hash = ih;
391 if ((iq = ih->ih_next)) {
392 iq->i_prevp = &ip->i_next;
393 }
394 ip->i_next = iq;
395 ip->i_prevp = &ih->ih_next;
396 ih->ih_next = ip;
397 ip->i_udquot = ip->i_gdquot = NULL; 290 ip->i_udquot = ip->i_gdquot = NULL;
398 ih->ih_version++;
399 xfs_iflags_set(ip, XFS_INEW); 291 xfs_iflags_set(ip, XFS_INEW);
400 write_unlock(&ih->ih_lock);
401 292
402 /* 293 ASSERT(ip->i_cluster == NULL);
403 * put ip on its cluster's hash chain
404 */
405 ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL &&
406 ip->i_cnext == NULL);
407
408 chlnew = NULL;
409 ch = XFS_CHASH(mp, ip->i_blkno);
410 chlredo:
411 s = mutex_spinlock(&ch->ch_lock);
412 for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) {
413 if (chl->chl_blkno == ip->i_blkno) {
414
415 /* insert this inode into the doubly-linked list
416 * where chl points */
417 if ((iq = chl->chl_ip)) {
418 ip->i_cprev = iq->i_cprev;
419 iq->i_cprev->i_cnext = ip;
420 iq->i_cprev = ip;
421 ip->i_cnext = iq;
422 } else {
423 ip->i_cnext = ip;
424 ip->i_cprev = ip;
425 }
426 chl->chl_ip = ip;
427 ip->i_chash = chl;
428 break;
429 }
430 }
431 294
432 /* no hash list found for this block; add a new hash list */ 295 if (!icl) {
433 if (chl == NULL) { 296 spin_lock_init(&new_icl->icl_lock);
434 if (chlnew == NULL) { 297 INIT_HLIST_HEAD(&new_icl->icl_inodes);
435 mutex_spinunlock(&ch->ch_lock, s); 298 icl = new_icl;
436 ASSERT(xfs_chashlist_zone != NULL); 299 new_icl = NULL;
437 chlnew = (xfs_chashlist_t *)
438 kmem_zone_alloc(xfs_chashlist_zone,
439 KM_SLEEP);
440 ASSERT(chlnew != NULL);
441 goto chlredo;
442 } else {
443 ip->i_cnext = ip;
444 ip->i_cprev = ip;
445 ip->i_chash = chlnew;
446 chlnew->chl_ip = ip;
447 chlnew->chl_blkno = ip->i_blkno;
448 if (ch->ch_list)
449 ch->ch_list->chl_prev = chlnew;
450 chlnew->chl_next = ch->ch_list;
451 chlnew->chl_prev = NULL;
452 ch->ch_list = chlnew;
453 chlnew = NULL;
454 }
455 } else { 300 } else {
456 if (chlnew != NULL) { 301 ASSERT(!hlist_empty(&icl->icl_inodes));
457 kmem_zone_free(xfs_chashlist_zone, chlnew);
458 }
459 } 302 }
303 spin_lock(&icl->icl_lock);
304 hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
305 ip->i_cluster = icl;
306 spin_unlock(&icl->icl_lock);
460 307
461 mutex_spinunlock(&ch->ch_lock, s); 308 write_unlock(&pag->pag_ici_lock);
462 309 radix_tree_preload_end();
310 if (new_icl)
311 kmem_zone_free(xfs_icluster_zone, new_icl);
463 312
464 /* 313 /*
465 * Link ip to its mount and thread it on the mount's inode list. 314 * Link ip to its mount and thread it on the mount's inode list.
@@ -478,6 +327,7 @@ finish_inode:
478 mp->m_inodes = ip; 327 mp->m_inodes = ip;
479 328
480 XFS_MOUNT_IUNLOCK(mp); 329 XFS_MOUNT_IUNLOCK(mp);
330 xfs_put_perag(mp, pag);
481 331
482 return_ip: 332 return_ip:
483 ASSERT(ip->i_df.if_ext_max == 333 ASSERT(ip->i_df.if_ext_max ==
@@ -587,32 +437,19 @@ xfs_inode_incore(xfs_mount_t *mp,
587 xfs_ino_t ino, 437 xfs_ino_t ino,
588 xfs_trans_t *tp) 438 xfs_trans_t *tp)
589{ 439{
590 xfs_ihash_t *ih;
591 xfs_inode_t *ip; 440 xfs_inode_t *ip;
592 ulong version; 441 xfs_perag_t *pag;
593 442
594 ih = XFS_IHASH(mp, ino); 443 pag = xfs_get_perag(mp, ino);
595 read_lock(&ih->ih_lock); 444 read_lock(&pag->pag_ici_lock);
596 for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { 445 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
597 if (ip->i_ino == ino) { 446 read_unlock(&pag->pag_ici_lock);
598 /* 447 xfs_put_perag(mp, pag);
599 * If we find it and tp matches, return it. 448
600 * Also move it to the front of the hash list 449 /* the returned inode must match the transaction */
601 * if we find it and it is not already there. 450 if (ip && (ip->i_transp != tp))
602 * Otherwise break from the loop and return 451 return NULL;
603 * NULL. 452 return ip;
604 */
605 if (ip->i_transp == tp) {
606 version = ih->ih_version;
607 read_unlock(&ih->ih_lock);
608 xfs_ihash_promote(ih, ip, version);
609 return (ip);
610 }
611 break;
612 }
613 }
614 read_unlock(&ih->ih_lock);
615 return (NULL);
616} 453}
617 454
618/* 455/*
@@ -718,58 +555,26 @@ void
718xfs_iextract( 555xfs_iextract(
719 xfs_inode_t *ip) 556 xfs_inode_t *ip)
720{ 557{
721 xfs_ihash_t *ih; 558 xfs_mount_t *mp = ip->i_mount;
559 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
722 xfs_inode_t *iq; 560 xfs_inode_t *iq;
723 xfs_mount_t *mp; 561
724 xfs_chash_t *ch; 562 write_lock(&pag->pag_ici_lock);
725 xfs_chashlist_t *chl, *chm; 563 radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
726 SPLDECL(s); 564 write_unlock(&pag->pag_ici_lock);
727 565 xfs_put_perag(mp, pag);
728 ih = ip->i_hash;
729 write_lock(&ih->ih_lock);
730 if ((iq = ip->i_next)) {
731 iq->i_prevp = ip->i_prevp;
732 }
733 *ip->i_prevp = iq;
734 ih->ih_version++;
735 write_unlock(&ih->ih_lock);
736 566
737 /* 567 /*
738 * Remove from cluster hash list 568 * Remove from cluster list
739 * 1) delete the chashlist if this is the last inode on the chashlist
740 * 2) unchain from list of inodes
741 * 3) point chashlist->chl_ip to 'chl_next' if to this inode.
742 */ 569 */
743 mp = ip->i_mount; 570 mp = ip->i_mount;
744 ch = XFS_CHASH(mp, ip->i_blkno); 571 spin_lock(&ip->i_cluster->icl_lock);
745 s = mutex_spinlock(&ch->ch_lock); 572 hlist_del(&ip->i_cnode);
746 573 spin_unlock(&ip->i_cluster->icl_lock);
747 if (ip->i_cnext == ip) { 574
748 /* Last inode on chashlist */ 575 /* was last inode in cluster? */
749 ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); 576 if (hlist_empty(&ip->i_cluster->icl_inodes))
750 ASSERT(ip->i_chash != NULL); 577 kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
751 chm=NULL;
752 chl = ip->i_chash;
753 if (chl->chl_prev)
754 chl->chl_prev->chl_next = chl->chl_next;
755 else
756 ch->ch_list = chl->chl_next;
757 if (chl->chl_next)
758 chl->chl_next->chl_prev = chl->chl_prev;
759 kmem_zone_free(xfs_chashlist_zone, chl);
760 } else {
761 /* delete one inode from a non-empty list */
762 iq = ip->i_cnext;
763 iq->i_cprev = ip->i_cprev;
764 ip->i_cprev->i_cnext = iq;
765 if (ip->i_chash->chl_ip == ip) {
766 ip->i_chash->chl_ip = iq;
767 }
768 ip->i_chash = __return_address;
769 ip->i_cprev = __return_address;
770 ip->i_cnext = __return_address;
771 }
772 mutex_spinunlock(&ch->ch_lock, s);
773 578
774 /* 579 /*
775 * Remove from mount's inode list. 580 * Remove from mount's inode list.