Diffstat (limited to 'fs/xfs'):
 fs/xfs/linux-2.6/xfs_sync.c | 84
 fs/xfs/xfs_iget.c           | 47
 fs/xfs/xfs_inode.c          | 52
 3 files changed, 141 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..fd38682da851 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway. If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
 		int error = 0;
 		int i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -118,18 +138,26 @@ restart:
 				batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is a already in reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode. Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
@@ -864,14 +902,14 @@ restart:
 		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 		int	i;
 
-		write_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup_tag(
 				&pag->pag_ici_root,
 				(void **)batch, first_index,
 				XFS_LOOKUP_BATCH,
 				XFS_ICI_RECLAIM_TAG);
 		if (!nr_found) {
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -891,14 +929,24 @@ restart:
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+						pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 9fae47556604..04ed09b907b8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -159,6 +157,16 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
 	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
@@ -169,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble. Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -397,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..43ffd9079106 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
 		 */
 		for (i = 0; i < ninodes; i++) {
 retry:
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or stale, nothing to do */
-			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-				read_unlock(&pag->pag_ici_lock);
+			/* Inode not in memory, nothing to do */
+			if (!ip) {
+				rcu_read_unlock();
 				continue;
 			}
 
 			/*
+			 * because this is an RCU protected lookup, we could
+			 * find a recently freed or even reallocated inode
+			 * during the lookup. We need to check under the
+			 * i_flags_lock for a valid inode here. Skip it if it
+			 * is not valid, the wrong inode or stale.
+			 */
+			spin_lock(&ip->i_flags_lock);
+			if (ip->i_ino != inum + i ||
+			    __xfs_iflags_test(ip, XFS_ISTALE)) {
+				spin_unlock(&ip->i_flags_lock);
+				rcu_read_unlock();
+				continue;
+			}
+			spin_unlock(&ip->i_flags_lock);
+
+			/*
 			 * Don't try to lock/unlock the current inode, but we
 			 * _cannot_ skip the other inodes that we did not find
 			 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
 			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				read_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				delay(1);
 				goto retry;
 			}
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	/* really need a gang lookup range call here */
 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
 					first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
 		iq = ilist[i];
 		if (iq == ip)
 			continue;
-		/* if the inode lies outside this cluster, we're done. */
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-			break;
+
+		/*
+		 * because this is an RCU protected lookup, we could find a
+		 * recently freed or even reallocated inode during the lookup.
+		 * We need to check under the i_flags_lock for a valid inode
+		 * here. Skip it if it is not valid or the wrong inode.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		if (!ip->i_ino ||
+		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
 	}
 
 out_free:
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	kmem_free(ilist);
 out_put:
 	xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
 	 * Corruption detected in the clustering loop. Invalidate the
 	 * inode buffer and shut down the filesystem.
 	 */
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	/*
 	 * Clean up the buffer. If it was B_DELWRI, just release it --
 	 * brelse can handle it with no problems. If not, shut down the
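
For orientation, the lookup pattern that the hunks above converge on can be condensed into a single sketch: an RCU-protected radix tree lookup, revalidation of the candidate inode under ip->i_flags_lock (inode number still set, no XFS_INEW/XFS_IRECLAIM* flags), and an igrab() before the RCU read lock is dropped. This is an illustrative sketch only, not part of the patch: the example_rcu_lookup() name is hypothetical, the kernel and XFS primitives it uses are the ones appearing in the diff, and batching, AG-boundary handling, and error returns are omitted.

/* Illustrative sketch only -- not the exact XFS code from this patch. */
static struct xfs_inode *
example_rcu_lookup(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	rcu_read_lock();	/* replaces read_lock(&pag->pag_ici_lock) */
	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
	if (!ip)
		goto out_unlock;

	/*
	 * The inode may have been freed and reallocated within the RCU grace
	 * period, so revalidate it under i_flags_lock before trusting it:
	 * xfs_inode_free() zeroes i_ino, and XFS_INEW/XFS_IRECLAIM* mark
	 * inodes that are not yet, or no longer, usable.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino ||
	    __xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) {
		spin_unlock(&ip->i_flags_lock);
		goto out_unlock;
	}
	spin_unlock(&ip->i_flags_lock);

	/* pin the VFS inode so it cannot be reclaimed once RCU is dropped */
	if (!igrab(VFS_I(ip)))
		goto out_unlock;

	rcu_read_unlock();
	return ip;

out_unlock:
	rcu_read_unlock();
	return NULL;
}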