Diffstat (limited to 'fs/xfs/linux-2.6/xfs_sync.c')
 fs/xfs/linux-2.6/xfs_sync.c | 265
 1 file changed, 146 insertions(+), 119 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
 #include "xfs_log.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
+#include "xfs_trans_priv.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
@@ -39,6 +40,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
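Note that the hunk above only declares the new xfs_syncd_wq pointer; the workqueue itself is allocated elsewhere in the series, outside this file's diff, so the flags it is created with are not visible here. For orientation only, a dedicated workqueue of this kind is normally created and destroyed with the generic API shown in this small sketch; the demo_* names and the flag choice are invented for illustration, not taken from XFS:

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;        /* stands in for xfs_syncd_wq */

static int demo_wq_init(void)
{
        /*
         * WQ_MEM_RECLAIM gives the queue a rescuer thread so queued work can
         * still make progress under memory pressure; a max_active of 0 means
         * "use the default concurrency".
         */
        demo_wq = alloc_workqueue("demo_syncd", WQ_MEM_RECLAIM, 0);
        if (!demo_wq)
                return -ENOMEM;
        return 0;
}

static void demo_wq_exit(void)
{
        destroy_workqueue(demo_wq);     /* drains and frees the queue */
}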
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
+ * wait for any remaining transactions to drain out before proceeding.
  */
 void
 xfs_quiesce_attr(
@@ -425,69 +428,18 @@ xfs_quiesce_attr(
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
 	if (error)
-		xfs_fs_cmn_err(CE_WARN, mp,
-			"xfs_attr_quiesce: failed to log sb changes. "
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 			"Frozen image may not be consistent.");
 	xfs_log_unmount_write(mp);
 	xfs_unmountfs_writesb(mp);
 }
 
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-	struct xfs_mount *mp,
-	void		*data,
-	void		(*syncer)(struct xfs_mount *, void *),
-	struct completion *completion)
-{
-	struct xfs_sync_work *work;
-
-	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
-	INIT_LIST_HEAD(&work->w_list);
-	work->w_syncer = syncer;
-	work->w_data = data;
-	work->w_mount = mp;
-	work->w_completion = completion;
-	spin_lock(&mp->m_sync_lock);
-	list_add_tail(&work->w_list, &mp->m_sync_list);
-	spin_unlock(&mp->m_sync_lock);
-	wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inodes_work(
-	struct xfs_mount *mp,
-	void		*arg)
-{
-	struct inode	*inode = arg;
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-	iput(inode);
-}
-
-void
-xfs_flush_inodes(
-	xfs_inode_t	*ip)
+static void
+xfs_syncd_queue_sync(
+	struct xfs_mount	*mp)
 {
-	struct inode	*inode = VFS_I(ip);
-	DECLARE_COMPLETION_ONSTACK(completion);
-
-	igrab(inode);
-	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
-	wait_for_completion(&completion);
-	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
+	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
 /*
@@ -497,9 +449,10 @@ xfs_flush_inodes(
  */
 STATIC void
 xfs_sync_worker(
-	struct xfs_mount *mp,
-	void		*unused)
+	struct work_struct *work)
 {
+	struct xfs_mount	*mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_sync_work);
 	int		error;
 
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -509,73 +462,106 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+		/* start pushing all the metadata that is currently dirty */
+		xfs_ail_push_all(mp->m_ail);
 	}
-	mp->m_sync_seq++;
-	wake_up(&mp->m_wait_single_sync_task);
+
+	/* queue us up again */
+	xfs_syncd_queue_sync(mp);
 }
 
-STATIC int
-xfssyncd(
-	void			*arg)
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount	*mp)
 {
-	struct xfs_mount	*mp = arg;
-	long			timeleft;
-	xfs_sync_work_t		*work, *n;
-	LIST_HEAD		(tmp);
-
-	set_freezable();
-	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-	for (;;) {
-		if (list_empty(&mp->m_sync_list))
-			timeleft = schedule_timeout_interruptible(timeleft);
-		/* swsusp */
-		try_to_freeze();
-		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-			break;
 
-		spin_lock(&mp->m_sync_lock);
-		/*
-		 * We can get woken by laptop mode, to do a sync -
-		 * that's the (only!) case where the list would be
-		 * empty with time remaining.
-		 */
-		if (!timeleft || list_empty(&mp->m_sync_list)) {
-			if (!timeleft)
-				timeleft = xfs_syncd_centisecs *
-						msecs_to_jiffies(10);
-			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-			list_add_tail(&mp->m_sync_work.w_list,
-					&mp->m_sync_list);
-		}
-		list_splice_init(&mp->m_sync_list, &tmp);
-		spin_unlock(&mp->m_sync_lock);
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
+	 * workqueue during unmount, so don't allow reclaim work to be queued
+	 * during unmount.
+	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
 
-		list_for_each_entry_safe(work, n, &tmp, w_list) {
-			(*work->w_syncer)(mp, work->w_data);
-			list_del(&work->w_list);
-			if (work == &mp->m_sync_work)
-				continue;
-			if (work->w_completion)
-				complete(work->w_completion);
-			kmem_free(work);
-		}
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
+	rcu_read_unlock();
+}
 
-	return 0;
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	queue_work(xfs_syncd_wq, &mp->m_flush_work);
+	flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(work,
+					struct xfs_mount, m_flush_work);
+
+	xfs_sync_data(mp, SYNC_TRYLOCK);
+	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
 int
 xfs_syncd_init(
 	struct xfs_mount	*mp)
 {
-	mp->m_sync_work.w_syncer = xfs_sync_worker;
-	mp->m_sync_work.w_mount = mp;
-	mp->m_sync_work.w_completion = NULL;
-	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
-	if (IS_ERR(mp->m_sync_task))
-		return -PTR_ERR(mp->m_sync_task);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
+
 	return 0;
 }
 
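For reference on the periods used above: assuming the stock sysctl default of xfs_syncd_centisecs = 3000 (30 seconds), msecs_to_jiffies(xfs_syncd_centisecs * 10) arms m_sync_work every 3000 * 10 ms = 30 s, while msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10) arms m_reclaim_work every 3000 / 6 * 10 ms = 5000 ms, which is the "every 5s based on the xfs syncd work default of 30s" that the new comment describes.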
@@ -583,7 +569,9 @@ void
 xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
-	kthread_stop(mp->m_sync_task);
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 }
 
 void
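Taken together, xfs_syncd_init(), xfs_sync_worker()/xfs_syncd_queue_sync() and xfs_syncd_stop() now follow the standard self-rearming delayed-work pattern. The sketch below is a minimal stand-alone illustration of that pattern using only the generic workqueue API; the demo_* names, the use of system_wq and the fixed 30-second period are placeholders for this sketch, not XFS code:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_mount {
        struct delayed_work     sync_work;      /* plays the role of m_sync_work */
};

static void demo_queue_sync(struct demo_mount *dm)
{
        /* (re)arm the periodic work; system_wq stands in for xfs_syncd_wq */
        queue_delayed_work(system_wq, &dm->sync_work,
                           msecs_to_jiffies(30 * 1000));
}

static void demo_sync_worker(struct work_struct *work)
{
        struct demo_mount *dm = container_of(to_delayed_work(work),
                                             struct demo_mount, sync_work);

        /* ... periodic sync work for dm goes here ... */

        /* queue us up again, as xfs_sync_worker() now does */
        demo_queue_sync(dm);
}

static void demo_start(struct demo_mount *dm)
{
        INIT_DELAYED_WORK(&dm->sync_work, demo_sync_worker);
        demo_queue_sync(dm);
}

static void demo_stop(struct demo_mount *dm)
{
        /* cancel a pending run and wait for a running one to finish */
        cancel_delayed_work_sync(&dm->sync_work);
}

One design note visible in the diff itself: because the reclaim work can also be queued from the inode tagging path, xfs_syncd_queue_reclaim() additionally checks MS_ACTIVE so that nothing gets re-queued once xfs_syncd_stop() has torn the work items down during unmount.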
@@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
 			XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 			XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -762,8 +754,10 @@ xfs_reclaim_inode(
 	struct xfs_perag	*pag,
 	int			sync_mode)
 {
-	int	error = 0;
+	int	error;
 
+restart:
+	error = 0;
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	if (!xfs_iflock_nowait(ip)) {
 		if (!(sync_mode & SYNC_WAIT))
@@ -789,9 +783,31 @@ xfs_reclaim_inode(
 	if (xfs_inode_clean(ip))
 		goto reclaim;
 
-	/* Now we have an inode that needs flushing */
-	error = xfs_iflush(ip, sync_mode);
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+	 * reclaim as we can deadlock with inode cluster removal.
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here. As a result,
+	 * doing a blocking xfs_itobp() to get the cluster buffer will result
+	 * in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+	 * just unlock the inode, back off and try again. Hopefully the next
+	 * pass through will see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
 	if (sync_mode & SYNC_WAIT) {
+		if (error == EAGAIN) {
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			/* backoff longer than in xfs_ifree_cluster */
+			delay(2);
+			goto restart;
+		}
 		xfs_iflock(ip);
 		goto reclaim;
 	}
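The restart/backoff added here is the usual escape from an ABBA ordering problem when one path cannot change its lock order: acquire the second lock with a trylock and, on failure, drop what you hold, back off briefly and start over. A generic sketch of that shape, using plain mutexes and invented names rather than the XFS inode and buffer locks:

#include <linux/delay.h>
#include <linux/mutex.h>

/* Path one (cf. xfs_ifree_cluster): takes A then B and may block on both. */
static void path_one(struct mutex *a, struct mutex *b)
{
        mutex_lock(a);
        mutex_lock(b);
        /* ... work under both locks ... */
        mutex_unlock(b);
        mutex_unlock(a);
}

/*
 * Path two (cf. xfs_reclaim_inode): needs the same two locks but takes B
 * first.  Blocking on A while holding B could deadlock against path_one(),
 * so A is only trylocked; on failure everything is dropped, we sleep
 * briefly and restart, mirroring the EAGAIN + delay(2) + goto restart above.
 */
static void path_two(struct mutex *a, struct mutex *b)
{
restart:
        mutex_lock(b);
        if (!mutex_trylock(a)) {
                mutex_unlock(b);
                msleep(2);      /* arbitrary short backoff for the sketch */
                goto restart;
        }
        /* ... work under both locks ... */
        mutex_unlock(a);
        mutex_unlock(b);
}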
@@ -806,7 +822,7 @@ xfs_reclaim_inode(
 	 * pass on the error.
 	 */
 	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
+		xfs_warn(ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
 	}
@@ -994,7 +1010,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doiing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer and push the AIL */
+		xfs_syncd_queue_reclaim(mp);
+		xfs_ail_push_all(mp->m_ail);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;