xfs: rename xfs_sync.[ch] to xfs_icache.[ch]

xfs_sync.c now only contains inode reclaim functions and inode cache iteration functions. It is not related to sync operations anymore. Rename to xfs_icache.c to reflect its contents and prepare for consolidation with the other inode cache file that exists (xfs_iget.c). Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Mark Tinguely <tinguely@sgi.com> Signed-off-by: Ben Myers <bpm@sgi.com>
author: Dave Chinner <dchinner@redhat.com> 2012-10-08 06:56:09 -0400
committer: Ben Myers <bpm@sgi.com> 2012-10-17 14:40:09 -0400
commit: 6d8b79cfca39399ef9115fb65dde85993455c9a3 (patch)
tree: c4702e765ee5b3d10f496c42148e317d7ee98ed8 /fs/xfs/xfs_icache.c
parent: c75921a72a7c4bb73a5e09a697a672722e5543f1 (diff)
1 files changed, 715 insertions, 0 deletions
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 000000000000..eba216f11d5e
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ *
+ * The value is used both as the size of the on-stack array of inode pointers
+ * and as the maximum number of entries requested from each radix tree gang
+ * lookup.
+ */
+#define XFS_LOOKUP_BATCH        32
+/*
+ * Validate an inode found by a cache lookup done under rcu_read_lock() and
+ * take a reference on its VFS inode with igrab().
+ *
+ * Returns 0 if the inode was grabbed.  Returns ENOENT if the inode is stale
+ * (i_ino == 0), has any of XFS_INEW, XFS_IRECLAIMABLE or XFS_IRECLAIM set,
+ * could not be grabbed, or is a bad inode.  Returns EFSCORRUPTED if the
+ * filesystem has been forcibly shut down.  Error values are positive.
+ */
+STATIC int
+xfs_inode_ag_walk_grab(
+        struct xfs_inode        *ip)
+{
+        struct inode            *inode = VFS_I(ip);
+        ASSERT(rcu_read_lock_held());
+        /*
+         * check for stale RCU freed inode
+         *
+         * If the inode has been reallocated, it doesn't matter if it's not in
+         * the AG we are walking - we are walking for writeback, so if it
+         * passes all the "valid inode" checks and is dirty, then we'll write
+         * it back anyway.  If it has been reallocated and still being
+         * initialised, the XFS_INEW check below will catch it.
+         */
+        spin_lock(&ip->i_flags_lock);
+        if (!ip->i_ino)
+                goto out_unlock_noent;
+        /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+        if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+                goto out_unlock_noent;
+        spin_unlock(&ip->i_flags_lock);
+        /* nothing to sync during shutdown */
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+                return EFSCORRUPTED;
+        /* If we can't grab the inode, it must be on its way to reclaim. */
+        if (!igrab(inode))
+                return ENOENT;
+        if (is_bad_inode(inode)) {
+                IRELE(ip);
+                return ENOENT;
+        }
+        /* inode is valid */
+        return 0;
+out_unlock_noent:
+        spin_unlock(&ip->i_flags_lock);
+        return ENOENT;
+}
+/*
+ * Walk the inode cache radix tree of a single AG and call @execute on every
+ * inode that xfs_inode_ag_walk_grab() accepts.
+ *
+ * Inodes are looked up under rcu_read_lock() in batches of up to
+ * XFS_LOOKUP_BATCH and grabbed; @execute is then called on each grabbed inode
+ * after the RCU read lock has been dropped, followed by IRELE() on that inode.
+ *
+ * If @execute returns EAGAIN the inode is counted as skipped.  If anything was
+ * skipped during a pass, the whole AG is walked again from index 0 after a
+ * delay(1).  If the last inode processed in a batch returned EFSCORRUPTED the
+ * current pass is terminated.
+ *
+ * Returns the last non-EAGAIN error seen from @execute, or 0.  Once
+ * EFSCORRUPTED has been recorded it is not overwritten by later errors.
+ */
+STATIC int
+xfs_inode_ag_walk(
+        struct xfs_mount        *mp,
+        struct xfs_perag        *pag,
+        int                     (*execute)(struct xfs_inode *ip,
+                                           struct xfs_perag *pag, int flags),
+        int                     flags)
+{
+        uint32_t                first_index;
+        int                     last_error = 0;
+        int                     skipped;
+        int                     done;
+        int                     nr_found;
+restart:
+        done = 0;
+        skipped = 0;
+        first_index = 0;
+        nr_found = 0;
+        do {
+                struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+                int             error = 0;
+                int             i;
+                rcu_read_lock();
+                nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                                        (void **)batch, first_index,
+                                        XFS_LOOKUP_BATCH);
+                if (!nr_found) {
+                        rcu_read_unlock();
+                        break;
+                }
+                /*
+                 * Grab the inodes before we drop the lock. Entries that
+                 * cannot be grabbed are replaced with NULL so the second
+                 * loop below skips them.
+                 */
+                for (i = 0; i < nr_found; i++) {
+                        struct xfs_inode *ip = batch[i];
+                        if (done || xfs_inode_ag_walk_grab(ip))
+                                batch[i] = NULL;
+                        /*
+                         * Update the index for the next lookup. Catch
+                         * overflows into the next AG range which can occur if
+                         * we have inodes in the last block of the AG and we
+                         * are currently pointing to the last inode.
+                         *
+                         * Because we may see inodes that are from the wrong AG
+                         * due to RCU freeing and reallocation, only update the
+                         * index if it lies in this AG. It was a race that led
+                         * us to see this inode, so another lookup from the
+                         * same index will not find it again.
+                         */
+                        if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                                continue;
+                        first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                        if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                done = 1;
+                }
+                /* unlock now we've grabbed the inodes. */
+                rcu_read_unlock();
+                for (i = 0; i < nr_found; i++) {
+                        if (!batch[i])
+                                continue;
+                        error = execute(batch[i], pag, flags);
+                        IRELE(batch[i]);
+                        if (error == EAGAIN) {
+                                skipped++;
+                                continue;
+                        }
+                        if (error && last_error != EFSCORRUPTED)
+                                last_error = error;
+                }
+                /* bail out if the filesystem is corrupted.  */
+                if (error == EFSCORRUPTED)
+                        break;
+                cond_resched();
+        } while (nr_found && !done);
+        if (skipped) {
+                delay(1);
+                goto restart;
+        }
+        return last_error;
+}
+/*
+ * Run xfs_inode_ag_walk() with @execute and @flags over every AG for which
+ * xfs_perag_get() returns a perag, in increasing AG number order.
+ *
+ * The most recent non-zero error from a per-AG walk is remembered and
+ * returned through XFS_ERROR().  The iteration stops early as soon as a walk
+ * returns EFSCORRUPTED.
+ */
+int
+xfs_inode_ag_iterator(
+        struct xfs_mount        *mp,
+        int                     (*execute)(struct xfs_inode *ip,
+                                           struct xfs_perag *pag, int flags),
+        int                     flags)
+{
+        struct xfs_perag        *pag;
+        xfs_agnumber_t          next_agno = 0;
+        int                     result = 0;
+        int                     error = 0;
+        for (;;) {
+                pag = xfs_perag_get(mp, next_agno);
+                if (!pag)
+                        break;
+                /* remember where to continue before dropping the reference */
+                next_agno = pag->pag_agno + 1;
+                error = xfs_inode_ag_walk(mp, pag, execute, flags);
+                xfs_perag_put(pag);
+                if (!error)
+                        continue;
+                result = error;
+                /* no point visiting further AGs once corruption is reported */
+                if (error == EFSCORRUPTED)
+                        break;
+        }
+        return XFS_ERROR(result);
+}
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ *
+ * The work is only queued when the perag radix tree has XFS_ICI_RECLAIM_TAG
+ * set on some entry.  The delay is xfs_syncd_centisecs / 6, multiplied by 10
+ * to convert centiseconds to the milliseconds msecs_to_jiffies() expects.
+ */
+static void
+xfs_reclaim_work_queue(
+        struct xfs_mount        *mp)
+{
+        rcu_read_lock();
+        if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+                queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                        msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+        }
+        rcu_read_unlock();
+}
+/*
+ * Background reclaim work function.
+ *
+ * This makes a quick, non-blocking (SYNC_TRYLOCK) pass over the inode cache
+ * to get reclaim started on as many inodes as it can, skipping inodes that
+ * are locked or already being flushed.  It is run from the delayed work
+ * embedded in the mount as m_reclaim_work, and when the pass is finished it
+ * asks for the next pass to be queued, so it keeps kicking itself while
+ * there is work to do.  The inode cache shrinker also kicks it when memory
+ * goes low.
+ */
+void
+xfs_reclaim_worker(
+        struct work_struct *work)
+{
+        struct delayed_work     *dwork = to_delayed_work(work);
+        struct xfs_mount        *mp;
+        /* the delayed work is embedded in the mount structure */
+        mp = container_of(dwork, struct xfs_mount, m_reclaim_work);
+        xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+        /* schedule the next background pass */
+        xfs_reclaim_work_queue(mp);
+}
+/*
+ * Tag @ip with XFS_ICI_RECLAIM_TAG in the inode cache radix tree of @pag and
+ * bump the per-AG count of reclaimable inodes.
+ *
+ * If this is the first reclaimable inode in the AG, the AG's entry in the
+ * mount's perag radix tree is tagged as well (under m_perag_lock) and
+ * background reclaim is queued.
+ *
+ * xfs_inode_set_reclaim_tag() calls this with pag->pag_ici_lock and
+ * ip->i_flags_lock held.
+ */
+void
+__xfs_inode_set_reclaim_tag(
+        struct xfs_perag        *pag,
+        struct xfs_inode        *ip)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        /* mark the inode's slot in the per-AG inode cache tree */
+        radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
+                           XFS_ICI_RECLAIM_TAG);
+        if (pag->pag_ici_reclaimable == 0) {
+                /* first reclaimable inode here: tag the AG in the perag tree */
+                spin_lock(&mp->m_perag_lock);
+                radix_tree_tag_set(&mp->m_perag_tree,
+                                   XFS_INO_TO_AGNO(mp, ip->i_ino),
+                                   XFS_ICI_RECLAIM_TAG);
+                spin_unlock(&mp->m_perag_lock);
+                /* make sure periodic background reclaim is scheduled */
+                xfs_reclaim_work_queue(mp);
+                trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+        }
+        pag->pag_ici_reclaimable += 1;
+}
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ *
+ * Both the radix tree tag and XFS_IRECLAIMABLE are set while holding
+ * pag->pag_ici_lock and ip->i_flags_lock, taken in that order.  A reference
+ * to the perag of the inode's AG is held across the update and dropped
+ * before returning.
+ */
+void
+xfs_inode_set_reclaim_tag(
+        xfs_inode_t     *ip)
+{
+        struct xfs_mount *mp = ip->i_mount;
+        struct xfs_perag *pag;
+        pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+        spin_lock(&pag->pag_ici_lock);
+        spin_lock(&ip->i_flags_lock);
+        __xfs_inode_set_reclaim_tag(pag, ip);
+        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+        spin_unlock(&ip->i_flags_lock);
+        spin_unlock(&pag->pag_ici_lock);
+        xfs_perag_put(pag);
+}
+/*
+ * Account for one fewer reclaimable inode in @pag.
+ *
+ * When the count drops to zero, XFS_ICI_RECLAIM_TAG is cleared from the AG's
+ * entry in the mount's perag radix tree (under m_perag_lock).  The inode's
+ * own tag in the per-AG inode cache tree is not touched here.
+ */
+STATIC void
+__xfs_inode_clear_reclaim(
+        xfs_perag_t     *pag,
+        xfs_inode_t     *ip)
+{
+        struct xfs_mount        *mp = ip->i_mount;
+        /* nothing more to do while other reclaimable inodes remain */
+        if (--pag->pag_ici_reclaimable)
+                return;
+        /* last one gone: clear the reclaim tag from the perag radix tree */
+        spin_lock(&mp->m_perag_lock);
+        radix_tree_tag_clear(&mp->m_perag_tree,
+                             XFS_INO_TO_AGNO(mp, ip->i_ino),
+                             XFS_ICI_RECLAIM_TAG);
+        spin_unlock(&mp->m_perag_lock);
+        trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
+}
+/*
+ * Clear XFS_ICI_RECLAIM_TAG for @ip in the inode cache radix tree of @pag,
+ * then update the per-AG reclaimable accounting via
+ * __xfs_inode_clear_reclaim().  @mp is used to convert the inode number into
+ * the AG-relative index used by the tree.
+ */
+void
+__xfs_inode_clear_reclaim_tag(
+        xfs_mount_t     *mp,
+        xfs_perag_t     *pag,
+        xfs_inode_t     *ip)
+{
+        radix_tree_tag_clear(&pag->pag_ici_root,
+                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+        __xfs_inode_clear_reclaim(pag, ip);
+}
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ *
+ * "Grabbing" means setting XFS_IRECLAIM on the inode under ip->i_flags_lock.
+ * The inode is rejected (1 is returned) if it is stale (i_ino == 0), if
+ * SYNC_TRYLOCK is set in @flags and XFS_IFLOCK or XFS_IRECLAIM is already
+ * set, if XFS_IRECLAIMABLE is not set, or if XFS_IRECLAIM is already set.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+        struct xfs_inode        *ip,
+        int                     flags)
+{
+        ASSERT(rcu_read_lock_held());
+        /* quick check for stale RCU freed inode */
+        if (!ip->i_ino)
+                return 1;
+        /*
+         * If we are asked for non-blocking operation, do unlocked checks to
+         * see if the inode already is being flushed or in reclaim to avoid
+         * lock traffic.
+         */
+        if ((flags & SYNC_TRYLOCK) &&
+            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+                return 1;
+        /*
+         * The radix tree lock here protects a thread in xfs_iget from racing
+         * with us starting reclaim on the inode.  Once we have the
+         * XFS_IRECLAIM flag set it will not touch us.
+         *
+         * Due to RCU lookup, we may find inodes that have been freed and only
+         * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+         * aren't candidates for reclaim at all, so we must check the
+         * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+         */
+        spin_lock(&ip->i_flags_lock);
+        if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+            __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                /* not a reclaim candidate. */
+                spin_unlock(&ip->i_flags_lock);
+                return 1;
+        }
+        __xfs_iflags_set(ip, XFS_IRECLAIM);
+        spin_unlock(&ip->i_flags_lock);
+        return 0;
+}
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *      inode state          iflush ret         required action
+ *      ---------------      ----------         ---------------
+ *      bad                     -               reclaim
+ *      shutdown                EIO             unpin and reclaim
+ *      clean, unpinned         0               reclaim
+ *      stale, unpinned         0               reclaim
+ *      clean, pinned(*)        0               requeue
+ *      stale, pinned           EAGAIN          requeue
+ *      dirty, async            -               requeue
+ *      dirty, sync             0               reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background reclaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *      bad             => reclaim
+ *      shutdown        => unpin and reclaim
+ *      pinned, async   => requeue
+ *      pinned, sync    => unpin
+ *      stale           => reclaim
+ *      clean           => reclaim
+ *      dirty, async    => requeue
+ *      dirty, sync     => flush, wait and reclaim
+ *
+ * Only SYNC_WAIT is tested in @sync_mode.  Without it, failing to get the
+ * flush lock immediately, or reaching the pinned or dirty checks below with a
+ * pinned or dirty inode, makes this function back out: XFS_IRECLAIM is
+ * cleared, the inode stays in the cache and 0 is returned.
+ *
+ * When the inode is reclaimed it is deleted from the radix tree of @pag, the
+ * per-AG reclaimable accounting is updated and the inode is freed.  The
+ * return value is then 0, or the error from xfs_iflush()/xfs_bwrite() if the
+ * inode had to be written back first.
+ */
+STATIC int
+xfs_reclaim_inode(
+        struct xfs_inode        *ip,
+        struct xfs_perag        *pag,
+        int                     sync_mode)
+{
+        struct xfs_buf          *bp = NULL;
+        int                     error;
+restart:
+        error = 0;
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        if (!xfs_iflock_nowait(ip)) {
+                if (!(sync_mode & SYNC_WAIT))
+                        goto out;
+                xfs_iflock(ip);
+        }
+        if (is_bad_inode(VFS_I(ip)))
+                goto reclaim;
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                xfs_iunpin_wait(ip);
+                xfs_iflush_abort(ip, false);
+                goto reclaim;
+        }
+        if (xfs_ipincount(ip)) {
+                if (!(sync_mode & SYNC_WAIT))
+                        goto out_ifunlock;
+                xfs_iunpin_wait(ip);
+        }
+        if (xfs_iflags_test(ip, XFS_ISTALE))
+                goto reclaim;
+        if (xfs_inode_clean(ip))
+                goto reclaim;
+        /*
+         * Never flush out dirty data during non-blocking reclaim, as it would
+         * just contend with AIL pushing trying to do the same job.
+         */
+        if (!(sync_mode & SYNC_WAIT))
+                goto out_ifunlock;
+        /*
+         * Now we have an inode that needs flushing.
+         *
+         * Note that xfs_iflush will never block on the inode buffer lock, as
+         * xfs_ifree_cluster() can lock the inode buffer before it locks the
+         * ip->i_lock, and we are doing the exact opposite here.  As a result,
+         * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+         * result in an ABBA deadlock with xfs_ifree_cluster().
+         *
+         * As xfs_ifree_cluster() must gather all inodes that are active in the
+         * cache to mark them stale, if we hit this case we don't actually want
+         * to do IO here - we want the inode marked stale so we can simply
+         * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
+         * inode, back off and try again.  Hopefully the next pass through will
+         * see the stale flag set on the inode.
+         */
+        error = xfs_iflush(ip, &bp);
+        if (error == EAGAIN) {
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                /* backoff longer than in xfs_ifree_cluster */
+                delay(2);
+                goto restart;
+        }
+        if (!error) {
+                error = xfs_bwrite(bp);
+                xfs_buf_relse(bp);
+        }
+        xfs_iflock(ip);
+reclaim:
+        xfs_ifunlock(ip);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        XFS_STATS_INC(xs_ig_reclaims);
+        /*
+         * Remove the inode from the per-AG radix tree.
+         *
+         * Because radix_tree_delete won't complain even if the item was never
+         * added to the tree assert that it's been there before to catch
+         * problems with the inode life time early on.
+         */
+        spin_lock(&pag->pag_ici_lock);
+        if (!radix_tree_delete(&pag->pag_ici_root,
+                                XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+                ASSERT(0);
+        __xfs_inode_clear_reclaim(pag, ip);
+        spin_unlock(&pag->pag_ici_lock);
+        /*
+         * Here we do an (almost) spurious inode lock in order to coordinate
+         * with inode cache radix tree lookups.  This is because the lookup
+         * can reference the inodes in the cache without taking references.
+         *
+         * We make that OK here by ensuring that we wait until the inode is
+         * unlocked after the lookup before we go ahead and free it.
+         */
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_qm_dqdetach(ip);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        xfs_inode_free(ip);
+        return error;
+out_ifunlock:
+        xfs_ifunlock(ip);
+out:
+        xfs_iflags_clear(ip, XFS_IRECLAIM);
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        /*
+         * We could return EAGAIN here to make reclaim rescan the inode tree in
+         * a short while. However, this just burns CPU time scanning the tree
+         * waiting for IO to complete and the reclaim work never goes back to
+         * the idle state. Instead, return 0 to let the next scheduled
+         * background reclaim attempt to reclaim the inode again.
+         */
+        return 0;
+}
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk will leak all the
+ * unreclaimed inodes.
+ *
+ * Only AGs tagged with XFS_ICI_RECLAIM_TAG in the perag tree are visited.
+ * With SYNC_TRYLOCK in @flags, an AG whose pag_ici_reclaim_lock cannot be
+ * taken immediately is skipped, and the scan of an AG starts from its saved
+ * pag_ici_reclaim_cursor; otherwise the lock is waited for and the scan
+ * starts at index 0.  @flags is also passed through to
+ * xfs_reclaim_inode_grab() and xfs_reclaim_inode().
+ *
+ * *@nr_to_scan is reduced by XFS_LOOKUP_BATCH after every batch, and the scan
+ * of an AG stops once it is no longer positive.  Every visited AG still gets
+ * at least one batch lookup.
+ *
+ * Returns, via XFS_ERROR(), the last error recorded from xfs_reclaim_inode()
+ * (an EFSCORRUPTED, once recorded, is not overwritten), or 0.
+ */
+int
+xfs_reclaim_inodes_ag(
+        struct xfs_mount        *mp,
+        int                     flags,
+        int                     *nr_to_scan)
+{
+        struct xfs_perag        *pag;
+        int                     error = 0;
+        int                     last_error = 0;
+        xfs_agnumber_t          ag;
+        int                     trylock = flags & SYNC_TRYLOCK;
+        int                     skipped;
+restart:
+        ag = 0;
+        skipped = 0;
+        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+                unsigned long   first_index = 0;
+                int             done = 0;
+                int             nr_found = 0;
+                ag = pag->pag_agno + 1;
+                if (trylock) {
+                        if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+                                skipped++;
+                                xfs_perag_put(pag);
+                                continue;
+                        }
+                        first_index = pag->pag_ici_reclaim_cursor;
+                } else
+                        mutex_lock(&pag->pag_ici_reclaim_lock);
+                do {
+                        struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+                        int     i;
+                        rcu_read_lock();
+                        nr_found = radix_tree_gang_lookup_tag(
+                                        &pag->pag_ici_root,
+                                        (void **)batch, first_index,
+                                        XFS_LOOKUP_BATCH,
+                                        XFS_ICI_RECLAIM_TAG);
+                        if (!nr_found) {
+                                done = 1;
+                                rcu_read_unlock();
+                                break;
+                        }
+                        /*
+                         * Grab the inodes before we drop the lock. Entries
+                         * that cannot be grabbed are replaced with NULL so
+                         * the reclaim loop below skips them.
+                         */
+                        for (i = 0; i < nr_found; i++) {
+                                struct xfs_inode *ip = batch[i];
+                                if (done || xfs_reclaim_inode_grab(ip, flags))
+                                        batch[i] = NULL;
+                                /*
+                                 * Update the index for the next lookup. Catch
+                                 * overflows into the next AG range which can
+                                 * occur if we have inodes in the last block of
+                                 * the AG and we are currently pointing to the
+                                 * last inode.
+                                 *
+                                 * Because we may see inodes that are from the
+                                 * wrong AG due to RCU freeing and
+                                 * reallocation, only update the index if it
+                                 * lies in this AG. It was a race that led us
+                                 * to see this inode, so another lookup from
+                                 * the same index will not find it again.
+                                 */
+                                if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                                pag->pag_agno)
+                                        continue;
+                                first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                                if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                        done = 1;
+                        }
+                        /* unlock now we've grabbed the inodes. */
+                        rcu_read_unlock();
+                        for (i = 0; i < nr_found; i++) {
+                                if (!batch[i])
+                                        continue;
+                                error = xfs_reclaim_inode(batch[i], pag, flags);
+                                if (error && last_error != EFSCORRUPTED)
+                                        last_error = error;
+                        }
+                        *nr_to_scan -= XFS_LOOKUP_BATCH;
+                        cond_resched();
+                } while (nr_found && !done && *nr_to_scan > 0);
+                if (trylock && !done)
+                        pag->pag_ici_reclaim_cursor = first_index;
+                else
+                        pag->pag_ici_reclaim_cursor = 0;
+                mutex_unlock(&pag->pag_ici_reclaim_lock);
+                xfs_perag_put(pag);
+        }
+        /*
+         * if we skipped any AG, and we still have scan count remaining, do
+         * another pass this time using blocking reclaim semantics (i.e.
+         * waiting on the reclaim locks and ignoring the reclaim cursors). This
+         * ensures that when we get more reclaimers than AGs we block rather
+         * than spin trying to execute reclaim.
+         */
+        if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+                trylock = 0;
+                goto restart;
+        }
+        return XFS_ERROR(last_error);
+}
+/*
+ * Reclaim inodes across the whole filesystem using the given reclaim @mode
+ * flags.  The scan count handed to xfs_reclaim_inodes_ag() starts at INT_MAX.
+ *
+ * Returns whatever xfs_reclaim_inodes_ag() returns.
+ */
+int
+xfs_reclaim_inodes(
+        xfs_mount_t     *mp,
+        int             mode)
+{
+        int             scan_budget = INT_MAX;
+        int             error;
+        error = xfs_reclaim_inodes_ag(mp, mode, &scan_budget);
+        return error;
+}
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ *
+ * @nr_to_scan is the scan count handed to xfs_reclaim_inodes_ag(), which is
+ * called with SYNC_TRYLOCK | SYNC_WAIT.  Its return value is not propagated
+ * to the caller.
+ */
+void
+xfs_reclaim_inodes_nr(
+        struct xfs_mount        *mp,
+        int                     nr_to_scan)
+{
+        /* kick background reclaimer and push the AIL */
+        xfs_reclaim_work_queue(mp);
+        xfs_ail_push_all(mp->m_ail);
+        xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+/*
+ * Add up the per-AG counts of reclaimable inodes over every AG that is
+ * tagged with XFS_ICI_RECLAIM_TAG and return the total, so the shrinker can
+ * decide how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+        struct xfs_mount        *mp)
+{
+        struct xfs_perag        *pag;
+        xfs_agnumber_t          next_agno = 0;
+        int                     total = 0;
+        for (;;) {
+                pag = xfs_perag_get_tag(mp, next_agno, XFS_ICI_RECLAIM_TAG);
+                if (!pag)
+                        break;
+                /* continue the tagged lookup after this AG */
+                next_agno = pag->pag_agno + 1;
+                total += pag->pag_ici_reclaimable;
+                xfs_perag_put(pag);
+        }
+        return total;
+}
author	Dave Chinner <dchinner@redhat.com>	2012-10-08 06:56:09 -0400
committer	Ben Myers <bpm@sgi.com>	2012-10-17 14:40:09 -0400
commit	6d8b79cfca39399ef9115fb65dde85993455c9a3 (patch)
tree	c4702e765ee5b3d10f496c42148e317d7ee98ed8 /fs/xfs/xfs_icache.c
parent	c75921a72a7c4bb73a5e09a697a672722e5543f1 (diff)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c new file mode 100644 index 000000000000..eba216f11d5e --- /dev/null +++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,715 @@
	1	/*
	2	* Copyright (c) 2000-2005 Silicon Graphics, Inc.
	3	* All Rights Reserved.
	4	*
	5	* This program is free software; you can redistribute it and/or
	6	* modify it under the terms of the GNU General Public License as
	7	* published by the Free Software Foundation.
	8	*
	9	* This program is distributed in the hope that it would be useful,
	10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	* GNU General Public License for more details.
	13	*
	14	* You should have received a copy of the GNU General Public License
	15	* along with this program; if not, write the Free Software Foundation,
	16	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	17	*/
	18	#include "xfs.h"
	19	#include "xfs_fs.h"
	20	#include "xfs_types.h"
	21	#include "xfs_log.h"
	22	#include "xfs_log_priv.h"
	23	#include "xfs_inum.h"
	24	#include "xfs_trans.h"
	25	#include "xfs_trans_priv.h"
	26	#include "xfs_sb.h"
	27	#include "xfs_ag.h"
	28	#include "xfs_mount.h"
	29	#include "xfs_bmap_btree.h"
	30	#include "xfs_inode.h"
	31	#include "xfs_dinode.h"
	32	#include "xfs_error.h"
	33	#include "xfs_filestream.h"
	34	#include "xfs_vnodeops.h"
	35	#include "xfs_inode_item.h"
	36	#include "xfs_quota.h"
	37	#include "xfs_trace.h"
	38	#include "xfs_fsops.h"
	39	#include "xfs_icache.h"
	40
	41	#include <linux/kthread.h>
	42	#include <linux/freezer.h>
	43
	44	/*
	45	* The inode lookup is done in batches to keep the amount of lock traffic and
	46	* radix tree lookups to a minimum. The batch size is a trade off between
	47	* lookup reduction and stack usage. This is in the reclaim path, so we can't
	48	* be too greedy.
	49	*/
	50	#define XFS_LOOKUP_BATCH 32
	51
	52	STATIC int
	53	xfs_inode_ag_walk_grab(
	54	struct xfs_inode *ip)
	55	{
	56	struct inode *inode = VFS_I(ip);
	57
	58	ASSERT(rcu_read_lock_held());
	59
	60	/*
	61	* check for stale RCU freed inode
	62	*
	63	* If the inode has been reallocated, it doesn't matter if it's not in
	64	* the AG we are walking - we are walking for writeback, so if it
	65	* passes all the "valid inode" checks and is dirty, then we'll write
	66	* it back anyway. If it has been reallocated and still being
	67	* initialised, the XFS_INEW check below will catch it.
	68	*/
	69	spin_lock(&ip->i_flags_lock);
	70	if (!ip->i_ino)
	71	goto out_unlock_noent;
	72
	73	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	74	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
	75	goto out_unlock_noent;
	76	spin_unlock(&ip->i_flags_lock);
	77
	78	/* nothing to sync during shutdown */
	79	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
	80	return EFSCORRUPTED;
	81
	82	/* If we can't grab the inode, it must be on its way to reclaim. */
	83	if (!igrab(inode))
	84	return ENOENT;
	85
	86	if (is_bad_inode(inode)) {
	87	IRELE(ip);
	88	return ENOENT;
	89	}
	90
	91	/* inode is valid */
	92	return 0;
	93
	94	out_unlock_noent:
	95	spin_unlock(&ip->i_flags_lock);
	96	return ENOENT;
	97	}
	98
	99	STATIC int
	100	xfs_inode_ag_walk(
	101	struct xfs_mount *mp,
	102	struct xfs_perag *pag,
	103	int (*execute)(struct xfs_inode *ip,
	104	struct xfs_perag *pag, int flags),
	105	int flags)
	106	{
	107	uint32_t first_index;
	108	int last_error = 0;
	109	int skipped;
	110	int done;
	111	int nr_found;
	112
	113	restart:
	114	done = 0;
	115	skipped = 0;
	116	first_index = 0;
	117	nr_found = 0;
	118	do {
	119	struct xfs_inode *batch[XFS_LOOKUP_BATCH];
	120	int error = 0;
	121	int i;
	122
	123	rcu_read_lock();
	124	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
	125	(void **)batch, first_index,
	126	XFS_LOOKUP_BATCH);
	127	if (!nr_found) {
	128	rcu_read_unlock();
	129	break;
	130	}
	131
	132	/*
	133	* Grab the inodes before we drop the lock. if we found
	134	* nothing, nr == 0 and the loop will be skipped.
	135	*/
	136	for (i = 0; i < nr_found; i++) {
	137	struct xfs_inode *ip = batch[i];
	138
	139	if (done || xfs_inode_ag_walk_grab(ip))
	140	batch[i] = NULL;
	141
	142	/*
	143	* Update the index for the next lookup. Catch
	144	* overflows into the next AG range which can occur if
	145	* we have inodes in the last block of the AG and we
	146	* are currently pointing to the last inode.
	147	*
	148	* Because we may see inodes that are from the wrong AG
	149	* due to RCU freeing and reallocation, only update the
	150	* index if it lies in this AG. It was a race that led
	151	* us to see this inode, so another lookup from the
	152	* same index will not find it again.
	153	*/
	154	if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
	155	continue;
	156	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
	157	if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
	158	done = 1;
	159	}
	160
	161	/* unlock now we've grabbed the inodes. */
	162	rcu_read_unlock();
	163
	164	for (i = 0; i < nr_found; i++) {
	165	if (!batch[i])
	166	continue;
	167	error = execute(batch[i], pag, flags);
	168	IRELE(batch[i]);
	169	if (error == EAGAIN) {
	170	skipped++;
	171	continue;
	172	}
	173	if (error && last_error != EFSCORRUPTED)
	174	last_error = error;
	175	}
	176
	177	/* bail out if the filesystem is corrupted. */
	178	if (error == EFSCORRUPTED)
	179	break;
	180
	181	cond_resched();
	182
	183	} while (nr_found && !done);
	184
	185	if (skipped) {
	186	delay(1);
	187	goto restart;
	188	}
	189	return last_error;
	190	}
	191
	192	int
	193	xfs_inode_ag_iterator(
	194	struct xfs_mount *mp,
	195	int (*execute)(struct xfs_inode *ip,
	196	struct xfs_perag *pag, int flags),
	197	int flags)
	198	{
	199	struct xfs_perag *pag;
	200	int error = 0;
	201	int last_error = 0;
	202	xfs_agnumber_t ag;
	203
	204	ag = 0;
	205	while ((pag = xfs_perag_get(mp, ag))) {
	206	ag = pag->pag_agno + 1;
	207	error = xfs_inode_ag_walk(mp, pag, execute, flags);
	208	xfs_perag_put(pag);
	209	if (error) {
	210	last_error = error;
	211	if (error == EFSCORRUPTED)
	212	break;
	213	}
	214	}
	215	return XFS_ERROR(last_error);
	216	}
	217
	218	/*
	219	* Queue a new inode reclaim pass if there are reclaimable inodes and there
	220	* isn't a reclaim pass already in progress. By default it runs every 5s based
	221	* on the xfs periodic sync default of 30s. Perhaps this should have its own
	222	* tunable, but that can be done if this method proves to be ineffective or too
	223	* aggressive.
	224	*/
	225	static void
	226	xfs_reclaim_work_queue(
	227	struct xfs_mount *mp)
	228	{
	229
	230	rcu_read_lock();
	231	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
	232	queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
	233	msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	234	}
	235	rcu_read_unlock();
	236	}
	237
	238	/*
	239	* This is a fast pass over the inode cache to try to get reclaim moving on as
	240	* many inodes as possible in a short period of time. It kicks itself every few
	241	* seconds, as well as being kicked by the inode cache shrinker when memory
	242	* goes low. It scans as quickly as possible avoiding locked inodes or those
	243	* already being flushed, and once done schedules a future pass.
	244	*/
	245	void
	246	xfs_reclaim_worker(
	247	struct work_struct *work)
	248	{
	249	struct xfs_mount *mp = container_of(to_delayed_work(work),
	250	struct xfs_mount, m_reclaim_work);
	251
	252	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	253	xfs_reclaim_work_queue(mp);
	254	}
	255
	256	void
	257	__xfs_inode_set_reclaim_tag(
	258	struct xfs_perag *pag,
	259	struct xfs_inode *ip)
	260	{
	261	radix_tree_tag_set(&pag->pag_ici_root,
	262	XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
	263	XFS_ICI_RECLAIM_TAG);
	264
	265	if (!pag->pag_ici_reclaimable) {
	266	/* propagate the reclaim tag up into the perag radix tree */
	267	spin_lock(&ip->i_mount->m_perag_lock);
	268	radix_tree_tag_set(&ip->i_mount->m_perag_tree,
	269	XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
	270	XFS_ICI_RECLAIM_TAG);
	271	spin_unlock(&ip->i_mount->m_perag_lock);
	272
	273	/* schedule periodic background inode reclaim */
	274	xfs_reclaim_work_queue(ip->i_mount);
	275
	276	trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
	277	-1, _RET_IP_);
	278	}
	279	pag->pag_ici_reclaimable++;
	280	}
	281
	282	/*
	283	* We set the inode flag atomically with the radix tree tag.
	284	* Once we get tag lookups on the radix tree, this inode flag
	285	* can go away.
	286	*/
	287	void
	288	xfs_inode_set_reclaim_tag(
	289	xfs_inode_t *ip)
	290	{
	291	struct xfs_mount *mp = ip->i_mount;
	292	struct xfs_perag *pag;
	293
	294	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	295	spin_lock(&pag->pag_ici_lock);
	296	spin_lock(&ip->i_flags_lock);
	297	__xfs_inode_set_reclaim_tag(pag, ip);
	298	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	299	spin_unlock(&ip->i_flags_lock);
	300	spin_unlock(&pag->pag_ici_lock);
	301	xfs_perag_put(pag);
	302	}
	303
	304	STATIC void
	305	__xfs_inode_clear_reclaim(
	306	xfs_perag_t *pag,
	307	xfs_inode_t *ip)
	308	{
	309	pag->pag_ici_reclaimable--;
	310	if (!pag->pag_ici_reclaimable) {
	311	/* clear the reclaim tag from the perag radix tree */
	312	spin_lock(&ip->i_mount->m_perag_lock);
	313	radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
	314	XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
	315	XFS_ICI_RECLAIM_TAG);
	316	spin_unlock(&ip->i_mount->m_perag_lock);
	317	trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
	318	-1, _RET_IP_);
	319	}
	320	}
	321
	322	void
	323	__xfs_inode_clear_reclaim_tag(
	324	xfs_mount_t *mp,
	325	xfs_perag_t *pag,
	326	xfs_inode_t *ip)
	327	{
	328	radix_tree_tag_clear(&pag->pag_ici_root,
	329	XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	330	__xfs_inode_clear_reclaim(pag, ip);
	331	}
	332
	333	/*
	334	* Grab the inode for reclaim exclusively.
	335	* Return 0 if we grabbed it, non-zero otherwise.
	336	*/
	337	STATIC int
	338	xfs_reclaim_inode_grab(
	339	struct xfs_inode *ip,
	340	int flags)
	341	{
	342	ASSERT(rcu_read_lock_held());
	343
	344	/* quick check for stale RCU freed inode */
	345	if (!ip->i_ino)
	346	return 1;
	347
	348	/*
	349	* If we are asked for non-blocking operation, do unlocked checks to
	350	* see if the inode already is being flushed or in reclaim to avoid
	351	* lock traffic.
	352	*/
	353	if ((flags & SYNC_TRYLOCK) &&
	354	__xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
	355	return 1;
	356
	357	/*
	358	* The radix tree lock here protects a thread in xfs_iget from racing
	359	* with us starting reclaim on the inode. Once we have the
	360	* XFS_IRECLAIM flag set it will not touch us.
	361	*
	362	* Due to RCU lookup, we may find inodes that have been freed and only
	363	* have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
	364	* aren't candidates for reclaim at all, so we must check the
	365	* XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	366	*/
	367	spin_lock(&ip->i_flags_lock);
	368	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	369	__xfs_iflags_test(ip, XFS_IRECLAIM)) {
	370	/* not a reclaim candidate. */
	371	spin_unlock(&ip->i_flags_lock);
	372	return 1;
	373	}
	374	__xfs_iflags_set(ip, XFS_IRECLAIM);
	375	spin_unlock(&ip->i_flags_lock);
	376	return 0;
	377	}
	378
	379	/*
	380	* Inodes in different states need to be treated differently. The following
	381	* table lists the inode states and the reclaim actions necessary:
	382	*
	383	* inode state        iflush ret    required action
	384	* ---------------    ----------    ---------------
	385	* bad                -             reclaim
	386	* shutdown           EIO           unpin and reclaim
	387	* clean, unpinned    0             reclaim
	388	* stale, unpinned    0             reclaim
	389	* clean, pinned(*)   0             requeue
	390	* stale, pinned      EAGAIN        requeue
	391	* dirty, async       -             requeue
	392	* dirty, sync        0             reclaim
	393	*
	394	* (*) dgc: I don't think the clean, pinned state is possible but it gets
	395	* handled anyway given the order of checks implemented.
	396	*
	397	* Also, because we get the flush lock first, we know that any inode that has
	398	* been flushed delwri has had the flush completed by the time we check that
	399	* the inode is clean.
	400	*
	401	* Note that because the inode is flushed delayed write by AIL pushing, the
	402	* flush lock may already be held here and waiting on it can result in very
	403	* long latencies. Hence for sync reclaims, where we wait on the flush lock,
	404	* the caller should push the AIL first before trying to reclaim inodes to
	405	* minimise the amount of time spent waiting. For background reclaim, we only
	406	* bother to reclaim clean inodes anyway.
	407	*
	408	* Hence the order of actions after gaining the locks should be:
	409	* bad => reclaim
	410	* shutdown => unpin and reclaim
	411	* pinned, async => requeue
	412	* pinned, sync => unpin
	413	* stale => reclaim
	414	* clean => reclaim
	415	* dirty, async => requeue
	416	* dirty, sync => flush, wait and reclaim
	417	*/
	418	STATIC int
	419	xfs_reclaim_inode(
	420	struct xfs_inode *ip,
	421	struct xfs_perag *pag,
	422	int sync_mode)
	423	{
	424	struct xfs_buf *bp = NULL;
	425	int error;
	426
	427	restart:
	428	error = 0;
	429	xfs_ilock(ip, XFS_ILOCK_EXCL);
	430	if (!xfs_iflock_nowait(ip)) {
	431	if (!(sync_mode & SYNC_WAIT))
	432	goto out;
	433	xfs_iflock(ip);
	434	}
	435
	436	if (is_bad_inode(VFS_I(ip)))
	437	goto reclaim;
	438	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
	439	xfs_iunpin_wait(ip);
	440	xfs_iflush_abort(ip, false);
	441	goto reclaim;
	442	}
	443	if (xfs_ipincount(ip)) {
	444	if (!(sync_mode & SYNC_WAIT))
	445	goto out_ifunlock;
	446	xfs_iunpin_wait(ip);
	447	}
	448	if (xfs_iflags_test(ip, XFS_ISTALE))
	449	goto reclaim;
	450	if (xfs_inode_clean(ip))
	451	goto reclaim;
	452
	453	/*
	454	* Never flush out dirty data during non-blocking reclaim, as it would
	455	* just contend with AIL pushing trying to do the same job.
	456	*/
	457	if (!(sync_mode & SYNC_WAIT))
	458	goto out_ifunlock;
	459
	460	/*
	461	* Now we have an inode that needs flushing.
	462	*
	463	* Note that xfs_iflush will never block on the inode buffer lock, as
	464	* xfs_ifree_cluster() can lock the inode buffer before it locks the
	465	* ip->i_lock, and we are doing the exact opposite here. As a result,
	466	* doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	467	* result in an ABBA deadlock with xfs_ifree_cluster().
	468	*
	469	* As xfs_ifree_cluster() must gather all inodes that are active in the
	470	* cache to mark them stale, if we hit this case we don't actually want
	471	* to do IO here - we want the inode marked stale so we can simply
	472	* reclaim it. Hence if we get an EAGAIN error here, just unlock the
	473	* inode, back off and try again. Hopefully the next pass through will
	474	* see the stale flag set on the inode.
	475	*/
	476	error = xfs_iflush(ip, &bp);
	477	if (error == EAGAIN) {
	478	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	479	/* backoff longer than in xfs_ifree_cluster */
	480	delay(2);
	481	goto restart;
	482	}
	483
	484	if (!error) {
	485	error = xfs_bwrite(bp);
	486	xfs_buf_relse(bp);
	487	}
	488
	489	xfs_iflock(ip);
	490	reclaim:
	491	xfs_ifunlock(ip);
	492	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	493
	494	XFS_STATS_INC(xs_ig_reclaims);
	495	/*
	496	* Remove the inode from the per-AG radix tree.
	497	*
	498	* Because radix_tree_delete won't complain even if the item was never
	499	* added to the tree assert that it's been there before to catch
	500	* problems with the inode life time early on.
	501	*/
	502	spin_lock(&pag->pag_ici_lock);
	503	if (!radix_tree_delete(&pag->pag_ici_root,
	504	XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
	505	ASSERT(0);
	506	__xfs_inode_clear_reclaim(pag, ip);
	507	spin_unlock(&pag->pag_ici_lock);
	508
	509	/*
	510	* Here we do an (almost) spurious inode lock in order to coordinate
	511	* with inode cache radix tree lookups. This is because the lookup
	512	* can reference the inodes in the cache without taking references.
	513	*
	514	* We make that OK here by ensuring that we wait until the inode is
	515	* unlocked after the lookup before we go ahead and free it.
	516	*/
	517	xfs_ilock(ip, XFS_ILOCK_EXCL);
	518	xfs_qm_dqdetach(ip);
	519	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	520
	521	xfs_inode_free(ip);
	522	return error;
	523
	524	out_ifunlock:
	525	xfs_ifunlock(ip);
	526	out:
	527	xfs_iflags_clear(ip, XFS_IRECLAIM);
	528	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	529	/*
	530	* We could return EAGAIN here to make reclaim rescan the inode tree in
	531	* a short while. However, this just burns CPU time scanning the tree
	532	* waiting for IO to complete and the reclaim work never goes back to
	533	* the idle state. Instead, return 0 to let the next scheduled
	534	* background reclaim attempt to reclaim the inode again.
	535	*/
	536	return 0;
	537	}
	538
	539	/*
	540	* Walk the AGs and reclaim the inodes in them. Even if the filesystem is
	541	* corrupted, we still want to try to reclaim all the inodes. If we don't,
	542	* then a shut down during filesystem unmount reclaim walk will leak all the
	543	* unreclaimed inodes.
	544	*/
	545	int
	546	xfs_reclaim_inodes_ag(
	547	struct xfs_mount *mp,
	548	int flags,
	549	int *nr_to_scan)
	550	{
	551	struct xfs_perag *pag;
	552	int error = 0;
	553	int last_error = 0;
	554	xfs_agnumber_t ag;
	555	int trylock = flags & SYNC_TRYLOCK;
	556	int skipped;
	557
	558	restart:
	559	ag = 0;
	560	skipped = 0;
	561	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
	562	unsigned long first_index = 0;
	563	int done = 0;
	564	int nr_found = 0;
	565
	566	ag = pag->pag_agno + 1;
	567
	568	if (trylock) {
	569	if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
	570	skipped++;
	571	xfs_perag_put(pag);
	572	continue;
	573	}
	574	first_index = pag->pag_ici_reclaim_cursor;
	575	} else
	576	mutex_lock(&pag->pag_ici_reclaim_lock);
	577
	578	do {
	579	struct xfs_inode *batch[XFS_LOOKUP_BATCH];
	580	int i;
	581
	582	rcu_read_lock();
	583	nr_found = radix_tree_gang_lookup_tag(
	584	&pag->pag_ici_root,
	585	(void **)batch, first_index,
	586	XFS_LOOKUP_BATCH,
	587	XFS_ICI_RECLAIM_TAG);
	588	if (!nr_found) {
	589	done = 1;
	590	rcu_read_unlock();
	591	break;
	592	}
	593
	594	/*
	595	* Grab the inodes before we drop the lock. if we found
	596	* nothing, nr == 0 and the loop will be skipped.
	597	*/
	598	for (i = 0; i < nr_found; i++) {
	599	struct xfs_inode *ip = batch[i];
	600
	601	if (done || xfs_reclaim_inode_grab(ip, flags))
	602	batch[i] = NULL;
	603
	604	/*
	605	* Update the index for the next lookup. Catch
	606	* overflows into the next AG range which can
	607	* occur if we have inodes in the last block of
	608	* the AG and we are currently pointing to the
	609	* last inode.
	610	*
	611	* Because we may see inodes that are from the
	612	* wrong AG due to RCU freeing and
	613	* reallocation, only update the index if it
	614	* lies in this AG. It was a race that led us
	615	* to see this inode, so another lookup from
	616	* the same index will not find it again.
	617	*/
	618	if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
	619	pag->pag_agno)
	620	continue;
	621	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
	622	if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
	623	done = 1;
	624	}
	625
	626	/* unlock now we've grabbed the inodes. */
	627	rcu_read_unlock();
	628
	629	for (i = 0; i < nr_found; i++) {
	630	if (!batch[i])
	631	continue;
	632	error = xfs_reclaim_inode(batch[i], pag, flags);
	633	if (error && last_error != EFSCORRUPTED)
	634	last_error = error;
	635	}
	636
	637	*nr_to_scan -= XFS_LOOKUP_BATCH;
	638
	639	cond_resched();
	640
	641	} while (nr_found && !done && *nr_to_scan > 0);
	642
	643	if (trylock && !done)
	644	pag->pag_ici_reclaim_cursor = first_index;
	645	else
	646	pag->pag_ici_reclaim_cursor = 0;
	647	mutex_unlock(&pag->pag_ici_reclaim_lock);
	648	xfs_perag_put(pag);
	649	}
	650
	651	/*
	652	* if we skipped any AG, and we still have scan count remaining, do
	653	* another pass this time using blocking reclaim semantics (i.e
	654	* waiting on the reclaim locks and ignoring the reclaim cursors). This
	655	* ensures that when we get more reclaimers than AGs we block rather
	656	* than spin trying to execute reclaim.
	657	*/
	658	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
	659	trylock = 0;
	660	goto restart;
	661	}
	662	return XFS_ERROR(last_error);
	663	}
	664
	665	int
	666	xfs_reclaim_inodes(
	667	xfs_mount_t *mp,
	668	int mode)
	669	{
	670	int nr_to_scan = INT_MAX;
	671
	672	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
	673	}
	674
	675	/*
	676	* Scan a certain number of inodes for reclaim.
	677	*
	678	* When called we make sure that there is a background (fast) inode reclaim in
	679	* progress, while we will throttle the speed of reclaim via doing synchronous
	680	* reclaim of inodes. That means if we come across dirty inodes, we wait for
	681	* them to be cleaned, which we hope will not be very long due to the
	682	* background walker having already kicked the IO off on those dirty inodes.
	683	*/
	684	void
	685	xfs_reclaim_inodes_nr(
	686	struct xfs_mount *mp,
	687	int nr_to_scan)
	688	{
	689	/* kick background reclaimer and push the AIL */
	690	xfs_reclaim_work_queue(mp);
	691	xfs_ail_push_all(mp->m_ail);
	692
	693	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
	694	}
	695
	696	/*
	697	* Return the number of reclaimable inodes in the filesystem for
	698	* the shrinker to determine how much to reclaim.
	699	*/
	700	int
	701	xfs_reclaim_inodes_count(
	702	struct xfs_mount *mp)
	703	{
	704	struct xfs_perag *pag;
	705	xfs_agnumber_t ag = 0;
	706	int reclaimable = 0;
	707
	708	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
	709	ag = pag->pag_agno + 1;
	710	reclaimable += pag->pag_ici_reclaimable;
	711	xfs_perag_put(pag);
	712	}
	713	return reclaimable;
	714	}
	715