aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <dchinner@redhat.com>2010-07-19 19:43:39 -0400
committerDave Chinner <david@fromorbit.com>2010-07-19 19:43:39 -0400
commit16fd5367370099b59d96e30bb7d9de8d419659f2 (patch)
treea9c12adb84f363ac48a0846de7a9464b1631e464 /fs
parent70e60ce71516c3a9e882edb70a09f696a05961db (diff)
xfs: track AGs with reclaimable inodes in per-ag radix tree
https://bugzilla.kernel.org/show_bug.cgi?id=16348 When the filesystem grows to a large number of allocation groups, the summing of recalimable inodes gets expensive. In many cases, most AGs won't have any reclaimable inodes and so we are wasting CPU time aggregating over these AGs. This is particularly important for the inode shrinker that gets called frequently under memory pressure. To avoid the overhead, track AGs with reclaimable inodes in the per-ag radix tree so that we can find all the AGs with reclaimable inodes via a simple gang tag lookup. This involves setting the tag when the first reclaimable inode is tracked in the AG, and removing the tag when the last reclaimable inode is removed from the tree. Then the summation process becomes a loop walking the radix tree summing AGs with the reclaim tag set. This significantly reduces the overhead of scanning - a 6400 AG filesystea now only uses about 25% of a cpu in kswapd while slab reclaim progresses instead of being permanently stuck at 100% CPU and making little progress. Clean filesystems filesystems will see no overhead and the overhead only increases linearly with the number of dirty AGs. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c71
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h3
2 files changed, 67 insertions, 7 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f433819611c..a51a07c3a70 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -144,6 +144,41 @@ restart:
144 return last_error; 144 return last_error;
145} 145}
146 146
147/*
148 * Select the next per-ag structure to iterate during the walk. The reclaim
149 * walk is optimised only to walk AGs with reclaimable inodes in them.
150 */
151static struct xfs_perag *
152xfs_inode_ag_iter_next_pag(
153 struct xfs_mount *mp,
154 xfs_agnumber_t *first,
155 int tag)
156{
157 struct xfs_perag *pag = NULL;
158
159 if (tag == XFS_ICI_RECLAIM_TAG) {
160 int found;
161 int ref;
162
163 spin_lock(&mp->m_perag_lock);
164 found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
165 (void **)&pag, *first, 1, tag);
166 if (found <= 0) {
167 spin_unlock(&mp->m_perag_lock);
168 return NULL;
169 }
170 *first = pag->pag_agno + 1;
171 /* open coded pag reference increment */
172 ref = atomic_inc_return(&pag->pag_ref);
173 spin_unlock(&mp->m_perag_lock);
174 trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
175 } else {
176 pag = xfs_perag_get(mp, *first);
177 (*first)++;
178 }
179 return pag;
180}
181
147int 182int
148xfs_inode_ag_iterator( 183xfs_inode_ag_iterator(
149 struct xfs_mount *mp, 184 struct xfs_mount *mp,
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
154 int exclusive, 189 int exclusive,
155 int *nr_to_scan) 190 int *nr_to_scan)
156{ 191{
192 struct xfs_perag *pag;
157 int error = 0; 193 int error = 0;
158 int last_error = 0; 194 int last_error = 0;
159 xfs_agnumber_t ag; 195 xfs_agnumber_t ag;
160 int nr; 196 int nr;
161 197
162 nr = nr_to_scan ? *nr_to_scan : INT_MAX; 198 nr = nr_to_scan ? *nr_to_scan : INT_MAX;
163 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 199 ag = 0;
164 struct xfs_perag *pag; 200 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
165
166 pag = xfs_perag_get(mp, ag);
167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 201 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
168 exclusive, &nr); 202 exclusive, &nr);
169 xfs_perag_put(pag); 203 xfs_perag_put(pag);
@@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag(
640 radix_tree_tag_set(&pag->pag_ici_root, 674 radix_tree_tag_set(&pag->pag_ici_root,
641 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 675 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
642 XFS_ICI_RECLAIM_TAG); 676 XFS_ICI_RECLAIM_TAG);
677
678 if (!pag->pag_ici_reclaimable) {
679 /* propagate the reclaim tag up into the perag radix tree */
680 spin_lock(&ip->i_mount->m_perag_lock);
681 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
682 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
683 XFS_ICI_RECLAIM_TAG);
684 spin_unlock(&ip->i_mount->m_perag_lock);
685 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
686 -1, _RET_IP_);
687 }
643 pag->pag_ici_reclaimable++; 688 pag->pag_ici_reclaimable++;
644} 689}
645 690
@@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag(
674 radix_tree_tag_clear(&pag->pag_ici_root, 719 radix_tree_tag_clear(&pag->pag_ici_root,
675 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 720 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
676 pag->pag_ici_reclaimable--; 721 pag->pag_ici_reclaimable--;
722 if (!pag->pag_ici_reclaimable) {
723 /* clear the reclaim tag from the perag radix tree */
724 spin_lock(&ip->i_mount->m_perag_lock);
725 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
726 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
727 XFS_ICI_RECLAIM_TAG);
728 spin_unlock(&ip->i_mount->m_perag_lock);
729 trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
730 -1, _RET_IP_);
731 }
677} 732}
678 733
679/* 734/*
@@ -838,7 +893,7 @@ xfs_reclaim_inode_shrink(
838 struct xfs_mount *mp; 893 struct xfs_mount *mp;
839 struct xfs_perag *pag; 894 struct xfs_perag *pag;
840 xfs_agnumber_t ag; 895 xfs_agnumber_t ag;
841 int reclaimable = 0; 896 int reclaimable;
842 897
843 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 898 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
844 if (nr_to_scan) { 899 if (nr_to_scan) {
@@ -852,8 +907,10 @@ xfs_reclaim_inode_shrink(
852 return -1; 907 return -1;
853 } 908 }
854 909
855 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 910 reclaimable = 0;
856 pag = xfs_perag_get(mp, ag); 911 ag = 0;
912 while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
913 XFS_ICI_RECLAIM_TAG))) {
857 reclaimable += pag->pag_ici_reclaimable; 914 reclaimable += pag->pag_ici_reclaimable;
858 xfs_perag_put(pag); 915 xfs_perag_put(pag);
859 } 916 }
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa11738..30282069090 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \
124 unsigned long caller_ip), \ 124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip)) 125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get); 126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put); 128DEFINE_PERAG_REF_EVENT(xfs_perag_put);
129DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
130DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
128 131
129TRACE_EVENT(xfs_attr_list_node_descend, 132TRACE_EVENT(xfs_attr_list_node_descend,
130 TP_PROTO(struct xfs_attr_list_context *ctx, 133 TP_PROTO(struct xfs_attr_list_context *ctx,