author		Dave Chinner <dchinner@redhat.com>	2010-12-17 01:29:43 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-12-17 01:29:43 -0500
commit		1a3e8f3da09c7082d25b512a0ffe569391e4c09a
tree		c717ebe79e1f969f929d1fe6fb044fb59114449f	fs/xfs/xfs_iget.c
parent		d95b7aaf9ab6738bef1ebcc52ab66563085e44ac
xfs: convert inode cache lookups to use RCU locking
With delayed logging greatly increasing the sustained parallelism of inode operations, the inode cache locking is showing significant read vs write contention when inode reclaim runs at the same time as lookups. There are also a lot more write lock acquisitions than read locks (a 4:1 ratio), so the read locking is not really buying us much in the way of parallelism.

To avoid the read vs write contention, change the cache to use RCU locking on the read side. To avoid needing to RCU free every single inode, use the built-in slab RCU freeing mechanism. This requires us to be able to detect lookups of freed inodes, so ensure that every freed inode has an inode number of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache hit lookup path, but add a check for a zero inode number as well.

We can then convert all the read-locked lookups to use RCU read-side locking and hence remove all the read-side locking.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
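For context, the shape of the change is the standard pattern for RCU lookups over slab-RCU-freed objects: walk the radix tree under rcu_read_lock() only, and because the object's memory may have been freed and recycled within the current grace period, re-validate its identity (here, the inode number) and its reclaim state under the object's own spinlock before trusting it. Below is a minimal sketch of that pattern, not the actual XFS code; the names my_obj, my_cache_lookup and MY_RECLAIM are hypothetical stand-ins.

/*
 * Sketch (hypothetical names): RCU read-side lookup over a radix tree
 * whose objects are freed with slab-RCU semantics.  The object's memory
 * stays valid for a full grace period after it is freed, so a reader
 * that finds it in the tree must re-validate it under its spinlock.
 */
#include <linux/err.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

#define MY_RECLAIM	0x1	/* set by the free side, like XFS_IRECLAIM */

struct my_obj {
	spinlock_t	lock;	/* plays the role of ip->i_flags_lock */
	unsigned long	id;	/* zeroed by the free side, like ip->i_ino */
	unsigned int	flags;
};

static struct my_obj *
my_cache_lookup(struct radix_tree_root *root, unsigned long id)
{
	struct my_obj	*obj;

	rcu_read_lock();
	obj = radix_tree_lookup(root, id);
	if (!obj) {
		rcu_read_unlock();
		return NULL;			/* genuine cache miss */
	}

	/*
	 * The tree slot may still point at memory that was freed (and
	 * possibly reused for a different object) within this grace
	 * period.  The free side zeroes obj->id and sets MY_RECLAIM under
	 * obj->lock, so taking the same lock gives a stable view to check.
	 */
	spin_lock(&obj->lock);
	if (obj->id != id || (obj->flags & MY_RECLAIM)) {
		spin_unlock(&obj->lock);
		rcu_read_unlock();
		return ERR_PTR(-EAGAIN);	/* stale hit: caller retries */
	}

	/* ... take a reference / do the normal cache-hit work here ... */
	spin_unlock(&obj->lock);
	rcu_read_unlock();
	return obj;
}

The patch below applies exactly this shape: xfs_iget() replaces read_lock(&pag->pag_ici_lock) with rcu_read_lock(), xfs_inode_free() zeroes ip->i_ino and sets XFS_IRECLAIM under ip->i_flags_lock before the RCU free, and xfs_iget_cache_hit() gains the ip->i_ino != ino re-check.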
Diffstat (limited to 'fs/xfs/xfs_iget.c')
-rw-r--r--	fs/xfs/xfs_iget.c	47
1 file changed, 35 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 9fae47556604..04ed09b907b8 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -80,6 +80,7 @@ xfs_inode_alloc(
 	ASSERT(atomic_read(&ip->i_pincount) == 0);
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
+	ASSERT(ip->i_ino == 0);
 
 	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 	lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
@@ -98,9 +99,6 @@ xfs_inode_alloc(
 	ip->i_size = 0;
 	ip->i_new_size = 0;
 
-	/* prevent anyone from using this yet */
-	VFS_I(ip)->i_state = I_NEW;
-
 	return ip;
 }
 
@@ -159,6 +157,16 @@ xfs_inode_free(
 	ASSERT(!spin_is_locked(&ip->i_flags_lock));
 	ASSERT(completion_done(&ip->i_flush));
 
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
 	call_rcu((struct rcu_head *)&VFS_I(ip)->i_dentry, __xfs_inode_free);
 }
 
@@ -169,14 +177,29 @@ static int
 xfs_iget_cache_hit(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
 	int			flags,
-	int			lock_flags) __releases(pag->pag_ici_lock)
+	int			lock_flags) __releases(RCU)
 {
 	struct inode		*inode = VFS_I(ip);
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
 	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
 
 	/*
 	 * If we are racing with another cache hit that is currently
@@ -219,7 +242,7 @@ xfs_iget_cache_hit(
 		ip->i_flags |= XFS_IRECLAIM;
 
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		error = -inode_init_always(mp->m_super, inode);
 		if (error) {
@@ -227,7 +250,7 @@ xfs_iget_cache_hit(
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble. Try to re-add it to the reclaim list.
 			 */
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
 
 			ip->i_flags &= ~XFS_INEW;
@@ -261,7 +284,7 @@ xfs_iget_cache_hit(
 
 		/* We've got a live one. */
 		spin_unlock(&ip->i_flags_lock);
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		trace_xfs_iget_hit(ip);
 	}
 
@@ -275,7 +298,7 @@ xfs_iget_cache_hit(
 
 out_error:
 	spin_unlock(&ip->i_flags_lock);
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -397,7 +420,7 @@ xfs_iget(
 	xfs_agino_t	agino;
 
 	/* reject inode numbers outside existing AGs */
-	if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
 		return EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
@@ -406,15 +429,15 @@ xfs_iget(
 
 again:
 	error = 0;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
 
 	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
 		if (error)
 			goto out_error_or_again;
 	} else {
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 		XFS_STATS_INC(xs_ig_missed);
 
 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,