author		Dave Chinner <dchinner@redhat.com>	2010-12-17 01:29:43 -0500
committer	Dave Chinner <david@fromorbit.com>	2010-12-17 01:29:43 -0500
commit		1a3e8f3da09c7082d25b512a0ffe569391e4c09a (patch)
tree		c717ebe79e1f969f929d1fe6fb044fb59114449f /fs/xfs/linux-2.6
parent		d95b7aaf9ab6738bef1ebcc52ab66563085e44ac (diff)
xfs: convert inode cache lookups to use RCU locking
With delayed logging greatly increasing the sustained parallelism of inode operations, the inode cache locking is showing significant read vs write contention when inode reclaim runs at the same time as lookups. There are also a lot more write lock acquisitions than read locks (a 4:1 ratio), so the read locking is not really buying us much in the way of parallelism.

To avoid the read vs write contention, change the cache to use RCU locking on the read side. To avoid needing to RCU free every single inode, use the built-in slab RCU freeing mechanism. This requires us to be able to detect lookups of freed inodes, so ensure that every freed inode has an inode number of zero and the XFS_IRECLAIM flag set. We already check the XFS_IRECLAIM flag in the cache hit lookup path, but also add a check for a zero inode number as well.

We can then convert all the read-locking lookups to use RCU read-side locking and hence remove all read-side locking.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
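As a rough sketch of the cache-hit revalidation this implies (illustrative only, not code from this patch): with slab RCU freeing, the memory backing an inode can be recycled for a new inode while a reader is still inside rcu_read_lock(), so a lookup hit must be re-checked under ip->i_flags_lock before it can be trusted. The function name and exact shape below are assumptions for illustration.

static struct xfs_inode *
example_ici_lookup(
	struct xfs_perag	*pag,
	xfs_ino_t		ino,	/* full inode number we expect */
	xfs_agino_t		agino)	/* AG-relative index for the tree */
{
	struct xfs_inode	*ip;

	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (!ip)
		goto out_unlock;

	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* freed (i_ino == 0) or recycled inode: treat as a miss */
		spin_unlock(&ip->i_flags_lock);
		ip = NULL;
		goto out_unlock;
	}
	spin_unlock(&ip->i_flags_lock);

	/* pin the VFS inode before leaving the RCU read-side section */
	if (!igrab(VFS_I(ip)))
		ip = NULL;
out_unlock:
	rcu_read_unlock();
	return ip;
}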
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	84
1 file changed, 66 insertions(+), 18 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index afb0d7cfad1c..fd38682da851 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
 {
 	struct inode		*inode = VFS_I(ip);
 
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway. If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return EFSCORRUPTED;
 
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		return ENOENT;
-
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
 		return ENOENT;
@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
 
 	/* inode is valid */
 	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
 }
 
 STATIC int
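For background on the "built in slab RCU freeing mechanism" the commit message refers to: SLAB_DESTROY_BY_RCU only defers reuse of the slab pages across an RCU grace period; an individual object can be recycled for a new inode immediately, which is exactly why the grab path above revalidates i_ino under i_flags_lock. A hedged sketch of the assumed wiring follows; the zone variable and the placement of the free path are illustrative and not part of this diff.

	/* inode zone set up so object memory stays type-stable under RCU */
	xfs_inode_zone = kmem_cache_create("xfs_inode",
			sizeof(struct xfs_inode), 0,
			SLAB_HWCACHE_ALIGN | SLAB_DESTROY_BY_RCU, NULL);

	/* free path: stamp the inode "stale" before the slab can recycle it */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;	/* concurrent RCU lookups skip this */
	ip->i_ino = 0;			/* !i_ino marks an RCU freed inode */
	spin_unlock(&ip->i_flags_lock);
	kmem_cache_free(xfs_inode_zone, ip);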
@@ -98,12 +118,12 @@ restart:
 		int error = 0;
 		int i;
 
-		read_lock(&pag->pag_ici_lock);
+		rcu_read_lock();
 		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
 		if (!nr_found) {
-			read_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 			break;
 		}
 
@@ -118,18 +138,26 @@ restart:
 				batch[i] = NULL;
 
 			/*
-			 * Update the index for the next lookup. Catch overflows
-			 * into the next AG range which can occur if we have inodes
-			 * in the last block of the AG and we are currently
-			 * pointing to the last inode.
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
 			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 				done = 1;
 		}
 
 		/* unlock now we've grabbed the inodes. */
-		read_unlock(&pag->pag_ici_lock);
+		rcu_read_unlock();
 
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
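The wrap check above relies on XFS inode numbers packing the AG number into the high bits and the AG-relative inode number into the low bits, so computing XFS_INO_TO_AGINO(mp, ip->i_ino + 1) on the last inode of an AG carries into the AG bits and yields an AG-relative index smaller than the current one. A simplified illustration with an invented bit split (the real split comes from the superblock geometry, not a fixed constant):

#include <stdint.h>

#define EX_AGINO_BITS	32		/* assumed for illustration only */

static inline uint32_t ex_ino_to_agno(uint64_t ino)
{
	return ino >> EX_AGINO_BITS;	/* like XFS_INO_TO_AGNO() */
}

static inline uint32_t ex_ino_to_agino(uint64_t ino)
{
	/* like XFS_INO_TO_AGINO(): mask off the AG number bits */
	return ino & ((1ULL << EX_AGINO_BITS) - 1);
}

If ino is the last inode of its AG, ex_ino_to_agino(ino + 1) wraps to 0, which is less than ex_ino_to_agino(ino), so the walk detects the overflow and sets done = 1.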
@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
 	struct xfs_inode	*ip,
 	int			flags)
 {
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
 
 	/*
 	 * do some unlocked checks first to avoid unnecessary lock traffic.
 	 * The first is a flush lock check, the second is a already in reclaim
 	 * check. Only do these checks if we are not going to block on locks.
 	 */
@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
 	 * The radix tree lock here protects a thread in xfs_iget from racing
 	 * with us starting reclaim on the inode. Once we have the
 	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
 	 */
 	spin_lock(&ip->i_flags_lock);
-	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* ignore as it is already under reclaim */
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
 		spin_unlock(&ip->i_flags_lock);
 		return 1;
 	}
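The ASSERT_ALWAYS() had to become a conditional because the invariant it encoded, that anything found via the reclaim tag is XFS_IRECLAIMABLE, no longer holds at RCU lookup time. When the checks pass, the function goes on to claim the inode; a hedged sketch of how that tail plausibly continues (this part is not shown in the hunk above):

	/*
	 * Checks passed while still holding ip->i_flags_lock: set
	 * XFS_IRECLAIM so that lookups and other reclaim walkers
	 * skip this inode from now on.
	 */
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;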
@@ -864,14 +902,14 @@ restart:
 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
 			int i;
 
-			write_lock(&pag->pag_ici_lock);
+			rcu_read_lock();
 			nr_found = radix_tree_gang_lookup_tag(
 					&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH,
 					XFS_ICI_RECLAIM_TAG);
 			if (!nr_found) {
-				write_unlock(&pag->pag_ici_lock);
+				rcu_read_unlock();
 				break;
 			}
 
@@ -891,14 +929,24 @@ restart:
 				 * occur if we have inodes in the last block of
 				 * the AG and we are currently pointing to the
 				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
 				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+							pag->pag_agno)
+					continue;
 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
 					done = 1;
 			}
 
 			/* unlock now we've grabbed the inodes. */
-			write_unlock(&pag->pag_ici_lock);
+			rcu_read_unlock();
 
 			for (i = 0; i < nr_found; i++) {
 				if (!batch[i])