author		Dave Chinner <dchinner@redhat.com>	2010-12-17 01:29:43 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-12-17 01:29:43 -0500
commit		1a3e8f3da09c7082d25b512a0ffe569391e4c09a
tree		c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/xfs_inode.c
parent		d95b7aaf9ab6738bef1ebcc52ab66563085e44ac
xfs: convert inode cache lookups to use RCU locking
With delayed logging greatly increasing the sustained parallelism of inode
operations, the inode cache locking is showing significant read vs write
contention when inode reclaim runs at the same time as lookups. There are
also a lot more write lock acquisitions than read lock acquisitions (a 4:1
ratio), so the read locking is not really buying us much in the way of
parallelism.
To avoid the read vs write contention, change the cache to use RCU locking on
the read side. To avoid needing to RCU-free every single inode, use the
built-in slab RCU freeing mechanism. This requires us to be able to detect
lookups of freed inodes, so ensure that every freed inode has an inode number
of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag
in the cache hit lookup path, but add a check for a zero inode number as well.
We can then convert all the read-locked lookups to use RCU read-side locking
and hence remove the read lock from the lookup path entirely.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
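
To make the lookup/free race described above concrete, here is a minimal
sketch, in kernel-style C, of the lookup-and-revalidate pattern this commit
adopts. It assumes the usual fs/xfs build context (xfs.h, xfs_inode.h, ...);
the helper name my_rcu_ilookup() and its standalone framing are hypothetical
illustrations, not code from this commit -- the real hunks appear in the diff
below.

/*
 * Sketch only -- not from this commit. With slab-RCU freeing, inode
 * memory can be freed and reused for a different inode within an RCU
 * grace period, so a radix tree hit must be revalidated under the
 * inode's own i_flags_lock before it can be trusted.
 */
static struct xfs_inode *
my_rcu_ilookup(			/* hypothetical helper name */
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
	if (!ip) {
		rcu_read_unlock();
		return NULL;
	}

	/*
	 * Revalidate under i_flags_lock: a freed inode has i_ino == 0
	 * and XFS_IRECLAIM set; slab-RCU reuse means the slot may now
	 * hold a different inode entirely.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino || __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		return NULL;
	}
	spin_unlock(&ip->i_flags_lock);

	/*
	 * Pin the inode before leaving the RCU critical section, as the
	 * xfs_ifree_cluster() hunk below does, so it cannot be freed
	 * once rcu_read_unlock() runs.
	 */
	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
		rcu_read_unlock();
		return NULL;		/* a real caller would retry */
	}
	rcu_read_unlock();
	return ip;			/* returned with ILOCK held */
}

The key point is that with SLAB_DESTROY_BY_RCU (the "built-in slab RCU
freeing mechanism" above), rcu_read_lock() only guarantees the memory stays
a struct xfs_inode for the duration of the critical section; it does not
guarantee it is still the same inode, hence the i_ino and flag checks under
i_flags_lock.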
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r--	fs/xfs/xfs_inode.c	52
1 file changed, 40 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 108c7a085f94..43ffd9079106 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2000,17 +2000,33 @@ xfs_ifree_cluster(
 		 */
 		for (i = 0; i < ninodes; i++) {
 retry:
-			read_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
 
-			/* Inode not in memory or stale, nothing to do */
-			if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
-				read_unlock(&pag->pag_ici_lock);
+			/* Inode not in memory, nothing to do */
+			if (!ip) {
+				rcu_read_unlock();
 				continue;
 			}
 
 			/*
+			 * because this is an RCU protected lookup, we could
+			 * find a recently freed or even reallocated inode
+			 * during the lookup. We need to check under the
+			 * i_flags_lock for a valid inode here. Skip it if it
+			 * is not valid, the wrong inode or stale.
+			 */
+			spin_lock(&ip->i_flags_lock);
+			if (ip->i_ino != inum + i ||
+			    __xfs_iflags_test(ip, XFS_ISTALE)) {
+				spin_unlock(&ip->i_flags_lock);
+				rcu_read_unlock();
+				continue;
+			}
+			spin_unlock(&ip->i_flags_lock);
+
+			/*
 			 * Don't try to lock/unlock the current inode, but we
 			 * _cannot_ skip the other inodes that we did not find
 			 * in the list attached to the buffer and are not
@@ -2019,11 +2035,11 @@ retry:
 			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
-				read_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				delay(1);
 				goto retry;
 			}
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
-	read_lock(&pag->pag_ici_lock);
+	rcu_read_lock();
 	/* really need a gang lookup range call here */
 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
 					first_index, inodes_per_cluster);
@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
 		iq = ilist[i];
 		if (iq == ip)
 			continue;
-		/* if the inode lies outside this cluster, we're done. */
-		if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
-			break;
+
+		/*
+		 * because this is an RCU protected lookup, we could find a
+		 * recently freed or even reallocated inode during the lookup.
+		 * We need to check under the i_flags_lock for a valid inode
+		 * here. Skip it if it is not valid or the wrong inode.
+		 */
+		spin_lock(&ip->i_flags_lock);
+		if (!ip->i_ino ||
+		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
+			spin_unlock(&ip->i_flags_lock);
+			continue;
+		}
+		spin_unlock(&ip->i_flags_lock);
+
 		/*
 		 * Do an un-protected check to see if the inode is dirty and
 		 * is a candidate for flushing. These checks will be repeated
@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
 	}
 
 out_free:
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	kmem_free(ilist);
 out_put:
 	xfs_perag_put(pag);
@@ -2704,7 +2732,7 @@ cluster_corrupt_out:
 	 * Corruption detected in the clustering loop. Invalidate the
 	 * inode buffer and shut down the filesystem.
 	 */
-	read_unlock(&pag->pag_ici_lock);
+	rcu_read_unlock();
 	/*
 	 * Clean up the buffer. If it was B_DELWRI, just release it --
 	 * brelse can handle it with no problems. If not, shut down the
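
Note that the diffstat is limited to fs/xfs/xfs_inode.c, so the cache-hit
lookup change the commit message mentions (checking for a zero inode number
alongside XFS_IRECLAIM) is not visible in this view; it lives in the
xfs_iget() cache-hit path. As a hedged sketch of what that revalidation
amounts to -- hypothetical helper name, with details such as stats and
tracing elided, not the commit's exact code:

/* Sketch only: returns true if the cached inode still matches the lookup. */
static bool
my_iget_hit_valid(			/* hypothetical helper name */
	struct xfs_inode	*ip,
	xfs_ino_t		ino)
{
	bool			valid = true;

	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino ||				/* freed (0) or reused */
	    __xfs_iflags_test(ip, XFS_IRECLAIM))	/* being torn down */
		valid = false;
	spin_unlock(&ip->i_flags_lock);

	return valid;		/* on false, the caller retries the lookup */
}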