path: root/fs/xfs
author    Dave Chinner <dchinner@redhat.com>  2016-05-18 00:09:12 -0400
committer Dave Chinner <david@fromorbit.com>  2016-05-18 00:09:12 -0400
commit    8a17d7ddedb4d9031f046ae0e97c40b46aa69db5 (patch)
tree      36509f042caa361bc2254ae12fe70f5fe32aa77c /fs/xfs
parent    1f2dcfe89edac4e3bf5b76c56f745191f921fd2a (diff)
xfs: mark reclaimed inodes invalid earlier
The last thing we do before using call_rcu() on an xfs_inode to be freed is mark it as invalid. This means there is a window between when we know for certain that the inode is going to be freed and when we actually mark it as "freed".

This is important in the context of RCU lookups - we can look up the inode, find that it is valid, and then use it as such, not realising that it is in the final stages of being freed.

As such, mark the inode as being invalid the moment we know it is going to be reclaimed. This can be done while we still hold the XFS_ILOCK_EXCL and the flush lock in xfs_reclaim_inode(), meaning that it occurs well before we remove it from the radix tree, and that the i_flags_lock, the XFS_ILOCK and the inode flush lock all act as synchronisation points for detecting that an inode is about to go away.

For defensive purposes, this allows us to add a further check to xfs_iflush_cluster to ensure we skip inodes that are being freed after we grab the XFS_ILOCK_SHARED and the flush lock - if the inode number is still valid while we hold these locks, we know the inode has not progressed through reclaim to the point where it is clean and about to be freed.

[bfoster: fixed __xfs_inode_clear_reclaim() using ip->i_ino after it
          had already been zeroed.]

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
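For context, the race this patch closes sits on the RCU lookup side: a reader that found the inode in the per-AG radix tree under rcu_read_lock() must re-validate it before use, because the inode may already be queued for RCU freeing. The sketch below is illustrative only and is not part of this patch; lookup_inode_rcu() is a hypothetical helper, but the re-check of ip->i_ino and XFS_IRECLAIM under i_flags_lock follows the pattern the commit message relies on.

        /*
         * Illustrative sketch, not kernel code from this patch: validate an
         * RCU radix-tree lookup result before using the inode.
         */
        static struct xfs_inode *
        lookup_inode_rcu(struct xfs_perag *pag, xfs_agino_t agino, xfs_ino_t ino)
        {
                struct xfs_inode        *ip;

                rcu_read_lock();
                ip = radix_tree_lookup(&pag->pag_ici_root, agino);
                if (!ip) {
                        rcu_read_unlock();
                        return NULL;
                }

                /*
                 * The inode may already be on its way to being freed via
                 * call_rcu(). With this patch, reclaim zeroes i_ino and sets
                 * XFS_IRECLAIM before the inode is removed from the radix
                 * tree, so a dying inode is detected here under i_flags_lock.
                 */
                spin_lock(&ip->i_flags_lock);
                if (ip->i_ino != ino || (ip->i_flags & XFS_IRECLAIM)) {
                        spin_unlock(&ip->i_flags_lock);
                        rcu_read_unlock();
                        return NULL;
                }
                spin_unlock(&ip->i_flags_lock);
                rcu_read_unlock();
                return ip;
        }

The reclaim side of the patch is the mirror image: i_flags and i_ino are updated under the same i_flags_lock while XFS_ILOCK_EXCL and the flush lock are still held, so xfs_iflush_cluster() only needs to re-check iq->i_ino after it has taken XFS_ILOCK_SHARED and the flush lock.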
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/xfs_icache.c  46
-rw-r--r--  fs/xfs/xfs_inode.c   13
2 files changed, 47 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0c94cde41016..57fcd5917a66 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -114,6 +114,18 @@ xfs_inode_free_callback(
         kmem_zone_free(xfs_inode_zone, ip);
 }
 
+static void
+__xfs_inode_free(
+        struct xfs_inode        *ip)
+{
+        /* asserts to verify all state is correct here */
+        ASSERT(atomic_read(&ip->i_pincount) == 0);
+        ASSERT(!xfs_isiflocked(ip));
+        XFS_STATS_DEC(ip->i_mount, vn_active);
+
+        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
 void
 xfs_inode_free(
         struct xfs_inode        *ip)
@@ -129,12 +141,7 @@ xfs_inode_free(
         ip->i_ino = 0;
         spin_unlock(&ip->i_flags_lock);
 
-        /* asserts to verify all state is correct here */
-        ASSERT(atomic_read(&ip->i_pincount) == 0);
-        ASSERT(!xfs_isiflocked(ip));
-        XFS_STATS_DEC(ip->i_mount, vn_active);
-
-        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+        __xfs_inode_free(ip);
 }
 
 /*
@@ -772,8 +779,7 @@ __xfs_inode_set_reclaim_tag(
         if (!pag->pag_ici_reclaimable) {
                 /* propagate the reclaim tag up into the perag radix tree */
                 spin_lock(&ip->i_mount->m_perag_lock);
-                radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno,
                                    XFS_ICI_RECLAIM_TAG);
                 spin_unlock(&ip->i_mount->m_perag_lock);
 
@@ -817,8 +823,7 @@ __xfs_inode_clear_reclaim(
         if (!pag->pag_ici_reclaimable) {
                 /* clear the reclaim tag from the perag radix tree */
                 spin_lock(&ip->i_mount->m_perag_lock);
-                radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                                XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno,
                                    XFS_ICI_RECLAIM_TAG);
                 spin_unlock(&ip->i_mount->m_perag_lock);
                 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
@@ -929,6 +934,7 @@ xfs_reclaim_inode(
         int                     sync_mode)
 {
         struct xfs_buf          *bp = NULL;
+        xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
         int                     error;
 
 restart:
@@ -993,6 +999,22 @@ restart:
 
         xfs_iflock(ip);
 reclaim:
+        /*
+         * Because we use RCU freeing we need to ensure the inode always appears
+         * to be reclaimed with an invalid inode number when in the free state.
+         * We do this as early as possible under the ILOCK and flush lock so
+         * that xfs_iflush_cluster() can be guaranteed to detect races with us
+         * here. By doing this, we guarantee that once xfs_iflush_cluster has
+         * locked both the XFS_ILOCK and the flush lock that it will see either
+         * a valid, flushable inode that will serialise correctly against the
+         * locks below, or it will see a clean (and invalid) inode that it can
+         * skip.
+         */
+        spin_lock(&ip->i_flags_lock);
+        ip->i_flags = XFS_IRECLAIM;
+        ip->i_ino = 0;
+        spin_unlock(&ip->i_flags_lock);
+
         xfs_ifunlock(ip);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
@@ -1006,7 +1028,7 @@ reclaim:
          */
         spin_lock(&pag->pag_ici_lock);
         if (!radix_tree_delete(&pag->pag_ici_root,
-                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                                XFS_INO_TO_AGINO(ip->i_mount, ino)))
                 ASSERT(0);
         __xfs_inode_clear_reclaim(pag, ip);
         spin_unlock(&pag->pag_ici_lock);
@@ -1023,7 +1045,7 @@ reclaim:
         xfs_qm_dqdetach(ip);
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
-        xfs_inode_free(ip);
+        __xfs_inode_free(ip);
         return error;
 
 out_ifunlock:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3cbc9031731b..e3b27982b3b2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3239,6 +3239,19 @@ xfs_iflush_cluster(
                         continue;
                 }
 
+
+                /*
+                 * Check the inode number again, just to be certain we are not
+                 * racing with freeing in xfs_reclaim_inode(). See the comments
+                 * in that function for more information as to why the initial
+                 * check is not sufficient.
+                 */
+                if (!iq->i_ino) {
+                        xfs_ifunlock(iq);
+                        xfs_iunlock(iq, XFS_ILOCK_SHARED);
+                        continue;
+                }
+
                 /*
                  * arriving here means that this inode can be flushed. First
                  * re-check that it's dirty before flushing.