diff options
Diffstat (limited to 'fs/xfs')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 69 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 1 |
2 files changed, 67 insertions, 3 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index af3275965c77..debe2822c930 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -461,7 +461,6 @@ xfs_sync_worker( | |||
461 | error = xfs_fs_log_dummy(mp); | 461 | error = xfs_fs_log_dummy(mp); |
462 | else | 462 | else |
463 | xfs_log_force(mp, 0); | 463 | xfs_log_force(mp, 0); |
464 | xfs_reclaim_inodes(mp, 0); | ||
465 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); | 464 | error = xfs_qm_sync(mp, SYNC_TRYLOCK); |
466 | } | 465 | } |
467 | 466 | ||
@@ -470,6 +469,52 @@ xfs_sync_worker( | |||
470 | } | 469 | } |
471 | 470 | ||
472 | /* | 471 | /* |
472 | * Queue a new inode reclaim pass if there are reclaimable inodes and there | ||
473 | * isn't a reclaim pass already in progress. By default it runs every 5s based | ||
474 | * on the xfs syncd work default of 30s. Perhaps this should have its own | ||
475 | * tunable, but that can be done if this method proves to be ineffective or too | ||
476 | * aggressive. | ||
477 | */ | ||
478 | static void | ||
479 | xfs_syncd_queue_reclaim( | ||
480 | struct xfs_mount *mp) | ||
481 | { | ||
482 | |||
483 | /* | ||
484 | * We can have inodes enter reclaim after we've shut down the syncd | ||
485 | * workqueue during unmount, so don't allow reclaim work to be queued | ||
486 | * during unmount. | ||
487 | */ | ||
488 | if (!(mp->m_super->s_flags & MS_ACTIVE)) | ||
489 | return; | ||
490 | |||
491 | rcu_read_lock(); | ||
492 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { | ||
493 | queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, | ||
494 | msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); | ||
495 | } | ||
496 | rcu_read_unlock(); | ||
497 | } | ||
498 | |||
499 | /* | ||
500 | * This is a fast pass over the inode cache to try to get reclaim moving on as | ||
501 | * many inodes as possible in a short period of time. It kicks itself every few | ||
502 | * seconds, as well as being kicked by the inode cache shrinker when memory | ||
503 | * goes low. It scans as quickly as possible avoiding locked inodes or those | ||
504 | * already being flushed, and once done schedules a future pass. | ||
505 | */ | ||
506 | STATIC void | ||
507 | xfs_reclaim_worker( | ||
508 | struct work_struct *work) | ||
509 | { | ||
510 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
511 | struct xfs_mount, m_reclaim_work); | ||
512 | |||
513 | xfs_reclaim_inodes(mp, SYNC_TRYLOCK); | ||
514 | xfs_syncd_queue_reclaim(mp); | ||
515 | } | ||
516 | |||
517 | /* | ||
473 | * Flush delayed allocate data, attempting to free up reserved space | 518 | * Flush delayed allocate data, attempting to free up reserved space |
474 | * from existing allocations. At this point a new allocation attempt | 519 | * from existing allocations. At this point a new allocation attempt |
475 | * has failed with ENOSPC and we are in the process of scratching our | 520 | * has failed with ENOSPC and we are in the process of scratching our |
@@ -508,7 +553,10 @@ xfs_syncd_init( | |||
508 | { | 553 | { |
509 | INIT_WORK(&mp->m_flush_work, xfs_flush_worker); | 554 | INIT_WORK(&mp->m_flush_work, xfs_flush_worker); |
510 | INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); | 555 | INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); |
556 | INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); | ||
557 | |||
511 | xfs_syncd_queue_sync(mp); | 558 | xfs_syncd_queue_sync(mp); |
559 | xfs_syncd_queue_reclaim(mp); | ||
512 | 560 | ||
513 | return 0; | 561 | return 0; |
514 | } | 562 | } |
@@ -518,6 +566,7 @@ xfs_syncd_stop( | |||
518 | struct xfs_mount *mp) | 566 | struct xfs_mount *mp) |
519 | { | 567 | { |
520 | cancel_delayed_work_sync(&mp->m_sync_work); | 568 | cancel_delayed_work_sync(&mp->m_sync_work); |
569 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
521 | cancel_work_sync(&mp->m_flush_work); | 570 | cancel_work_sync(&mp->m_flush_work); |
522 | } | 571 | } |
523 | 572 | ||
@@ -537,6 +586,10 @@ __xfs_inode_set_reclaim_tag( | |||
537 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), | 586 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), |
538 | XFS_ICI_RECLAIM_TAG); | 587 | XFS_ICI_RECLAIM_TAG); |
539 | spin_unlock(&ip->i_mount->m_perag_lock); | 588 | spin_unlock(&ip->i_mount->m_perag_lock); |
589 | |||
590 | /* schedule periodic background inode reclaim */ | ||
591 | xfs_syncd_queue_reclaim(ip->i_mount); | ||
592 | |||
540 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, | 593 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, |
541 | -1, _RET_IP_); | 594 | -1, _RET_IP_); |
542 | } | 595 | } |
@@ -953,7 +1006,13 @@ xfs_reclaim_inodes( | |||
953 | } | 1006 | } |
954 | 1007 | ||
955 | /* | 1008 | /* |
956 | * Shrinker infrastructure. | 1009 | * Inode cache shrinker. |
1010 | * | ||
1011 | * When called we make sure that there is a background (fast) inode reclaim in | ||
1012 | * progress, while we will throttle the speed of reclaim via doing synchronous | ||
1013 | * reclaim of inodes. That means if we come across dirty inodes, we wait for | ||
1014 | * them to be cleaned, which we hope will not be very long due to the | ||
1015 | * background walker having already kicked the IO off on those dirty inodes. | ||
957 | */ | 1016 | */ |
958 | static int | 1017 | static int |
959 | xfs_reclaim_inode_shrink( | 1018 | xfs_reclaim_inode_shrink( |
@@ -968,10 +1027,14 @@ xfs_reclaim_inode_shrink( | |||
968 | 1027 | ||
969 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); | 1028 | mp = container_of(shrink, struct xfs_mount, m_inode_shrink); |
970 | if (nr_to_scan) { | 1029 | if (nr_to_scan) { |
1030 | /* kick background reclaimer */ | ||
1031 | xfs_syncd_queue_reclaim(mp); | ||
1032 | |||
971 | if (!(gfp_mask & __GFP_FS)) | 1033 | if (!(gfp_mask & __GFP_FS)) |
972 | return -1; | 1034 | return -1; |
973 | 1035 | ||
974 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); | 1036 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, |
1037 | &nr_to_scan); | ||
975 | /* terminate if we don't exhaust the scan */ | 1038 | /* terminate if we don't exhaust the scan */ |
976 | if (nr_to_scan > 0) | 1039 | if (nr_to_scan > 0) |
977 | return -1; | 1040 | return -1; |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a0ad90e95299..19af0ab0d0c6 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -204,6 +204,7 @@ typedef struct xfs_mount { | |||
204 | #endif | 204 | #endif |
205 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ | 205 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ |
206 | struct delayed_work m_sync_work; /* background sync work */ | 206 | struct delayed_work m_sync_work; /* background sync work */ |
207 | struct delayed_work m_reclaim_work; /* background inode reclaim */ | ||
207 | struct work_struct m_flush_work; /* background inode flush */ | 208 | struct work_struct m_flush_work; /* background inode flush */ |
208 | __int64_t m_update_flags; /* sb flags we need to update | 209 | __int64_t m_update_flags; /* sb flags we need to update |
209 | on the next remount,rw */ | 210 | on the next remount,rw */ |