Diffstat (limited to 'fs/xfs'):
 fs/xfs/linux-2.6/xfs_sync.c | 84
 fs/xfs/xfs_iget.c           | 47
 fs/xfs/xfs_inode.c          | 52
 3 files changed, 141 insertions(+), 42 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..fd38682da851 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway. If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
@@ -98,12 +118,12 @@ restart:
 		int error = 0;
 		int i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -118,18 +138,26 @@ restart:
 				batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
-	 * do some unlocked checks first to avoid unnecceary lock traffic.
+	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is a already in reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode. Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
@@ -864,14 +902,14 @@ restart:
 		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 		int	i;
 
-		write_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup_tag(
 				&pag->pag_ici_root,
 				(void **)batch, first_index,
 				XFS_LOOKUP_BATCH,
 				XFS_ICI_RECLAIM_TAG);
 		if (!nr_found) {
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -891,14 +929,24 @@ restart:
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+						pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 9fae47556604..04ed09b907b8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -159,6 +157,16 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
 	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
@@ -169,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble. Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -397,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..43ffd9079106 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
 		 */
 		for (i = 0; i < ninodes; i++) {
 retry:
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or stale, nothing to do */
-			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-				read_unlock(&pag->pag_ici_lock);
+			/* Inode not in memory, nothing to do */
+			if (!ip) {
+				rcu_read_unlock();
 				continue;
 			}
 
 			/*
+			 * because this is an RCU protected lookup, we could
+			 * find a recently freed or even reallocated inode
+			 * during the lookup. We need to check under the
+			 * i_flags_lock for a valid inode here. Skip it if it
+			 * is not valid, the wrong inode or stale.
+			 */
+			spin_lock(&ip->i_flags_lock);
+			if (ip->i_ino != inum + i ||
+			    __xfs_iflags_test(ip, XFS_ISTALE)) {
+				spin_unlock(&ip->i_flags_lock);
+				rcu_read_unlock();
+				continue;
+			}
+			spin_unlock(&ip->i_flags_lock);
+
+			/*
 			 * Don't try to lock/unlock the current inode, but we
 			 * _cannot_ skip the other inodes that we did not find
 			 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
 			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				read_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				delay(1);
 				goto retry;
 			}
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	/* really need a gang lookup range call here */
 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
 					first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
 		iq = ilist[i];
 		if (iq == ip)
 			continue;
-		/* if the inode lies outside this cluster, we're done. */
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-			break;
+
+		/*
+		 * because this is an RCU protected lookup, we could find a
+		 * recently freed or even reallocated inode during the lookup.
+		 * We need to check under the i_flags_lock for a valid inode
+		 * here. Skip it if it is not valid or the wrong inode.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		if (!ip->i_ino ||
+		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
 	}
 
 out_free:
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	kmem_free(ilist);
 out_put:
 	xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
 	 * Corruption detected in the clustering loop. Invalidate the
 	 * inode buffer and shut down the filesystem.
 	 */
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	/*
 	 * Clean up the buffer. If it was B_DELWRI, just release it --
 	 * brelse can handle it with no problems. If not, shut down the
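
For orientation, the lookup pattern that the hunks above converge on can be condensed into a single sketch: an RCU-protected radix tree lookup, revalidation of the candidate inode under ip->i_flags_lock (inode number still set, no XFS_INEW/XFS_IRECLAIM* flags), and an igrab() before the RCU read lock is dropped. This is an illustrative sketch only, not part of the patch: the example_rcu_lookup() name is hypothetical, the kernel and XFS primitives it uses are the ones appearing in the diff, and batching, AG-boundary handling, and error returns are omitted.

/* Illustrative sketch only -- not the exact XFS code from this patch. */
static struct xfs_inode *
example_rcu_lookup(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	rcu_read_lock();	/* replaces read_lock(&pag->pag_ici_lock) */
	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
	if (!ip)
		goto out_unlock;

	/*
	 * The inode may have been freed and reallocated within the RCU grace
	 * period, so revalidate it under i_flags_lock before trusting it:
	 * xfs_inode_free() zeroes i_ino, and XFS_INEW/XFS_IRECLAIM* mark
	 * inodes that are not yet, or no longer, usable.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino ||
	    __xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) {
		spin_unlock(&ip->i_flags_lock);
		goto out_unlock;
	}
	spin_unlock(&ip->i_flags_lock);

	/* pin the VFS inode so it cannot be reclaimed once RCU is dropped */
	if (!igrab(VFS_I(ip)))
		goto out_unlock;

	rcu_read_unlock();
	return ip;

out_unlock:
	rcu_read_unlock();
	return NULL;
}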