xfs: convert inode cache lookups to use RCU locking

With delayed logging greatly increasing the sustained parallelism of inode operations, the inode cache locking is showing significant read vs write contention when inode reclaim runs at the same time as lookups. There are also a lot more write lock acquisitions than there are read locks (4:1 ratio) so the read locking is not really buying us much in the way of parallelism. To avoid the read vs write contention, change the cache to use RCU locking on the read side. To avoid needing to RCU free every single inode, use the built in slab RCU freeing mechanism. This requires us to be able to detect lookups of freed inodes, so ensure that every freed inode has an inode number of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache hit lookup path, but also add a check for a zero inode number as well. We can then convert all the read locking lookups to use RCU read side locking and hence remove all read side locking. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Alex Elder <aelder@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2010-12-17 01:29:43 -0500
committer: Dave Chinner <david@fromorbit.com> 2010-12-17 01:29:43 -0500
commit: 1a3e8f3da09c7082d25b512a0ffe569391e4c09a (patch)
tree: c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/xfs_inode.c
parent: d95b7aaf9ab6738bef1ebcc52ab66563085e44ac (diff)
1 files changed, 40 insertions, 12 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..43ffd9079106 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
                 */
                for (i = 0; i < ninodes; i++) {
 retry:
-                        read_lock(&pag->pag_ici_lock);
+                        rcu_read_lock();
                        ip = radix_tree_lookup(&pag->pag_ici_root,
                                        XFS_INO_TO_AGINO(mp, (inum + i)));
-                        /* Inode not in memory or stale, nothing to do */
+                        /* Inode not in memory, nothing to do */
-                        if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
+                        if (!ip) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                continue;
                        }
                        /*
+                         * because this is an RCU protected lookup, we could
+                         * find a recently freed or even reallocated inode
+                         * during the lookup. We need to check under the
+                         * i_flags_lock for a valid inode here. Skip it if it
+                         * is not valid, the wrong inode or stale.
+                         */
+                        spin_lock(&ip->i_flags_lock);
+                        if (ip->i_ino != inum + i ||
+                            __xfs_iflags_test(ip, XFS_ISTALE)) {
+                                spin_unlock(&ip->i_flags_lock);
+                                rcu_read_unlock();
+                                continue;
+                        }
+                        spin_unlock(&ip->i_flags_lock);
+                        /*
                         * Don't try to lock/unlock the current inode, but we
                         * _cannot_ skip the other inodes that we did not find
                         * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
                         */
                        if (ip != free_ip &&
                            !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-                                read_unlock(&pag->pag_ici_lock);
+                                rcu_read_unlock();
                                delay(1);
                                goto retry;
                        }
-                        read_unlock(&pag->pag_ici_lock);
+                        rcu_read_unlock();
                        xfs_iflock(ip);
                        xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-        read_lock(&pag->pag_ici_lock);
+        rcu_read_lock();
        /* really need a gang lookup range call here */
        nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
                                        first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
                iq = ilist[i];
                if (iq == ip)
                        continue;
-                /* if the inode lies outside this cluster, we're done. */
-                if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
+                /*
-                        break;
+                 * because this is an RCU protected lookup, we could find a
+                 * recently freed or even reallocated inode during the lookup.
+                 * We need to check under the i_flags_lock for a valid inode
+                 * here. Skip it if it is not valid or the wrong inode.
+                 */
+                spin_lock(&ip->i_flags_lock);
+                if (!ip->i_ino ||
+                    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+                        spin_unlock(&ip->i_flags_lock);
+                        continue;
+                }
+                spin_unlock(&ip->i_flags_lock);
                /*
                 * Do an un-protected check to see if the inode is dirty and
                 * is a candidate for flushing.  These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
        }
 out_free:
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        kmem_free(ilist);
 out_put:
        xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
         * Corruption detected in the clustering loop.  Invalidate the
         * inode buffer and shut down the filesystem.
         */
-        read_unlock(&pag->pag_ici_lock);
+        rcu_read_unlock();
        /*
         * Clean up the buffer.  If it was B_DELWRI, just release it --
         * brelse can handle it with no problems.  If not, shut down the
author	Dave Chinner <dchinner@redhat.com>	2010-12-17 01:29:43 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-12-17 01:29:43 -0500
commit	1a3e8f3da09c7082d25b512a0ffe569391e4c09a (patch)
tree	c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/xfs_inode.c
parent	d95b7aaf9ab6738bef1ebcc52ab66563085e44ac (diff)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 108c7a085f94..43ffd9079106 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
2000	*/	2000	*/
2001	for (i = 0; i < ninodes; i++) {	2001	for (i = 0; i < ninodes; i++) {
2002	retry:	2002	retry:
2003	read_lock(&pag->pag_ici_lock);	2003	rcu_read_lock();
2004	ip = radix_tree_lookup(&pag->pag_ici_root,	2004	ip = radix_tree_lookup(&pag->pag_ici_root,
2005	XFS_INO_TO_AGINO(mp, (inum + i)));	2005	XFS_INO_TO_AGINO(mp, (inum + i)));
2006		2006
2007	/* Inode not in memory or stale, nothing to do */	2007	/* Inode not in memory, nothing to do */
2008	if (!ip \|\| xfs_iflags_test(ip, XFS_ISTALE)) {	2008	if (!ip) {
2009	read_unlock(&pag->pag_ici_lock);	2009	rcu_read_unlock();
2010	continue;	2010	continue;
2011	}	2011	}
2012		2012
2013	/*	2013	/*
		2014	* because this is an RCU protected lookup, we could
		2015	* find a recently freed or even reallocated inode
		2016	* during the lookup. We need to check under the
		2017	* i_flags_lock for a valid inode here. Skip it if it
		2018	* is not valid, the wrong inode or stale.
		2019	*/
		2020	spin_lock(&ip->i_flags_lock);
		2021	if (ip->i_ino != inum + i \|\|
		2022	__xfs_iflags_test(ip, XFS_ISTALE)) {
		2023	spin_unlock(&ip->i_flags_lock);
		2024	rcu_read_unlock();
		2025	continue;
		2026	}
		2027	spin_unlock(&ip->i_flags_lock);
		2028
		2029	/*
2014	* Don't try to lock/unlock the current inode, but we	2030	* Don't try to lock/unlock the current inode, but we
2015	* _cannot_ skip the other inodes that we did not find	2031	* _cannot_ skip the other inodes that we did not find
2016	* in the list attached to the buffer and are not	2032	* in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
2019	*/	2035	*/
2020	if (ip != free_ip &&	2036	if (ip != free_ip &&
2021	!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {	2037	!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2022	read_unlock(&pag->pag_ici_lock);	2038	rcu_read_unlock();
2023	delay(1);	2039	delay(1);
2024	goto retry;	2040	goto retry;
2025	}	2041	}
2026	read_unlock(&pag->pag_ici_lock);	2042	rcu_read_unlock();
2027		2043
2028	xfs_iflock(ip);	2044	xfs_iflock(ip);
2029	xfs_iflags_set(ip, XFS_ISTALE);	2045	xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
2629		2645
2630	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);	2646	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2631	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;	2647	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2632	read_lock(&pag->pag_ici_lock);	2648	rcu_read_lock();
2633	/* really need a gang lookup range call here */	2649	/* really need a gang lookup range call here */
2634	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,	2650	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2635	first_index, inodes_per_cluster);	2651	first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
2640	iq = ilist[i];	2656	iq = ilist[i];
2641	if (iq == ip)	2657	if (iq == ip)
2642	continue;	2658	continue;
2643	/* if the inode lies outside this cluster, we're done. */	2659
2644	if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)	2660	/*
2645	break;	2661	* because this is an RCU protected lookup, we could find a
		2662	* recently freed or even reallocated inode during the lookup.
		2663	* We need to check under the i_flags_lock for a valid inode
		2664	* here. Skip it if it is not valid or the wrong inode.
		2665	*/
		2666	spin_lock(&ip->i_flags_lock);
		2667	if (!ip->i_ino \|\|
		2668	(XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
		2669	spin_unlock(&ip->i_flags_lock);
		2670	continue;
		2671	}
		2672	spin_unlock(&ip->i_flags_lock);
		2673
2646	/*	2674	/*
2647	* Do an un-protected check to see if the inode is dirty and	2675	* Do an un-protected check to see if the inode is dirty and
2648	* is a candidate for flushing. These checks will be repeated	2676	* is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
2692	}	2720	}
2693		2721
2694	out_free:	2722	out_free:
2695	read_unlock(&pag->pag_ici_lock);	2723	rcu_read_unlock();
2696	kmem_free(ilist);	2724	kmem_free(ilist);
2697	out_put:	2725	out_put:
2698	xfs_perag_put(pag);	2726	xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
2704	* Corruption detected in the clustering loop. Invalidate the	2732	* Corruption detected in the clustering loop. Invalidate the
2705	* inode buffer and shut down the filesystem.	2733	* inode buffer and shut down the filesystem.
2706	*/	2734	*/
2707	read_unlock(&pag->pag_ici_lock);	2735	rcu_read_unlock();
2708	/*	2736	/*
2709	* Clean up the buffer. If it was B_DELWRI, just release it --	2737	* Clean up the buffer. If it was B_DELWRI, just release it --
2710	* brelse can handle it with no problems. If not, shut down the	2738	* brelse can handle it with no problems. If not, shut down the