author     Dave Chinner <dchinner@redhat.com>   2011-04-07 22:45:07 -0400
committer  Dave Chinner <david@fromorbit.com>   2011-04-07 22:45:07 -0400
commit     a7b339f1b8698667eada006e717cdb4523be2ed5 (patch)
tree       77c44400c32284bdcf15829e10d01eb15ddd1d41 /fs/xfs
parent     89e4cb550a492cfca038a555fcc1bdac58822ec3 (diff)
xfs: introduce background inode reclaim work
Background inode reclaim needs to run more frequently than the XFS syncd work is run, as 30s is too long between optimal reclaim runs. Add a new periodic work item to the xfs syncd workqueue to run a fast, non-blocking inode reclaim scan.

Background inode reclaim is kicked by the act of marking inodes for reclaim. When an AG is first marked as having reclaimable inodes, the background reclaim work is kicked. It will continue to run periodically until it detects that there are no more reclaimable inodes. It will be kicked again when the first inode is queued for reclaim.

To ensure shrinker-based inode reclaim throttles to the inode cleaning and reclaim rate but still reclaims inodes efficiently, make it kick the background inode reclaim so that when we are low on memory we are trying to reclaim inodes as efficiently as possible. This kick should not be necessary, but it will protect against failures to kick the background reclaim when inodes are first dirtied.

To provide the rate throttling, make the shrinker pass do synchronous inode reclaim so that it blocks on inodes under IO. This means that the shrinker will reclaim inodes rather than just skipping over them, but it does not adversely affect the rate of reclaim because most dirty inodes are already under IO due to the background reclaim work the shrinker kicked.

These two modifications solve one of the two OOM killer invocations Chris Mason reported recently when running a stress testing script. The particular workload trigger for the OOM killer invocation is where there are more threads than CPUs all unlinking files in an extremely memory constrained environment. Unlike other solutions, this one does not have a performance impact when memory is not constrained or the number of concurrent threads operating is <= the number of CPUs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
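The change is built on the standard self-rearming delayed work pattern: a kick function queues the work only when there is something to reclaim, and the worker does one fast, non-blocking pass and then re-queues itself. With the default xfs syncd period of 30s (xfs_syncd_centisecs = 3000), the expression msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10) used in the patch works out to 5000ms, i.e. the 5s period the new comment describes. Below is a minimal, standalone sketch of that pattern only; the demo_* names and the module wrapper are hypothetical and not part of this patch, whose real implementation (xfs_syncd_queue_reclaim(), xfs_reclaim_worker(), m_reclaim_work queued on xfs_syncd_wq) appears in the diff further down.

/*
 * Minimal sketch of the self-rearming delayed work pattern used by this
 * patch.  All demo_* identifiers are hypothetical stand-ins; see the diff
 * below for the real XFS code.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;

/* stand-in for the radix tree reclaim-tag check in xfs_syncd_queue_reclaim() */
static bool demo_have_reclaimable(void)
{
	return true;
}

/* Queue a pass only when there is work to do; cheap enough to call often. */
static void demo_queue_work(void)
{
	if (demo_have_reclaimable())
		queue_delayed_work(demo_wq, &demo_work,
				   msecs_to_jiffies(5000));	/* ~5s period */
}

/* One fast, non-blocking pass, then re-arm for the next period. */
static void demo_worker(struct work_struct *work)
{
	/* stand-in for xfs_reclaim_inodes(mp, SYNC_TRYLOCK) */
	pr_info("demo: fast reclaim pass\n");
	demo_queue_work();
}

static int __init demo_init(void)
{
	demo_wq = create_workqueue("demo_reclaim");
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_work, demo_worker);
	demo_queue_work();	/* also kicked whenever work first appears */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");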
Diffstat (limited to 'fs/xfs')
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c   69
-rw-r--r--  fs/xfs/xfs_mount.h             1
2 files changed, 67 insertions, 3 deletions
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index af3275965c7..debe2822c93 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -461,7 +461,6 @@ xfs_sync_worker(
 			error = xfs_fs_log_dummy(mp);
 		else
 			xfs_log_force(mp, 0);
-		xfs_reclaim_inodes(mp, 0);
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 	}
 
@@ -470,6 +469,52 @@ xfs_sync_worker(
 }
 
 /*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+	struct xfs_mount	*mp)
+{
+
+	/*
+	 * We can have inodes enter reclaim after we've shut down the syncd
+	 * workqueue during unmount, so don't allow reclaim work to be queued
+	 * during unmount.
+	 */
+	if (!(mp->m_super->s_flags & MS_ACTIVE))
+		return;
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_syncd_queue_reclaim(mp);
+}
+
+/*
  * Flush delayed allocate data, attempting to free up reserved space
  * from existing allocations. At this point a new allocation attempt
  * has failed with ENOSPC and we are in the process of scratching our
@@ -508,7 +553,10 @@ xfs_syncd_init(
 {
 	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
 	xfs_syncd_queue_sync(mp);
+	xfs_syncd_queue_reclaim(mp);
 
 	return 0;
 }
@@ -518,6 +566,7 @@ xfs_syncd_stop(
 	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	cancel_work_sync(&mp->m_flush_work);
 }
 
@@ -537,6 +586,10 @@ __xfs_inode_set_reclaim_tag(
 				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
 				XFS_ICI_RECLAIM_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_syncd_queue_reclaim(ip->i_mount);
+
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
 	}
@@ -953,7 +1006,13 @@ xfs_reclaim_inodes(
 }
 
 /*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
  */
 static int
 xfs_reclaim_inode_shrink(
@@ -968,10 +1027,14 @@ xfs_reclaim_inode_shrink(
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
+		/* kick background reclaimer */
+		xfs_syncd_queue_reclaim(mp);
+
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+		xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+					&nr_to_scan);
 		/* terminate if we don't exhaust the scan */
 		if (nr_to_scan > 0)
 			return -1;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a0ad90e9529..19af0ab0d0c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -204,6 +204,7 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;	/* per-mount filestream data */
 	struct delayed_work	m_sync_work;	/* background sync work */
+	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
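The shrinker side of the change (the last xfs_sync.c hunk above) follows a simple control flow: kick the asynchronous background pass, bail out if the allocation context cannot recurse into the filesystem, then do a synchronous scan so the caller is throttled by the real rate at which dirty inodes can be cleaned. A compact sketch of that flow follows; the demo_* helpers are hypothetical stubs standing in for xfs_syncd_queue_reclaim() and the SYNC_TRYLOCK | SYNC_WAIT reclaim walk, and the real callback is xfs_reclaim_inode_shrink() above.

/*
 * Sketch of the shrinker callback control flow introduced by this patch.
 * demo_* helpers are hypothetical stubs; see xfs_reclaim_inode_shrink()
 * in the diff above for the real code.
 */
#include <linux/gfp.h>
#include <linux/types.h>

static void demo_kick_background(void)
{
	/* stand-in for xfs_syncd_queue_reclaim(mp) */
}

static void demo_reclaim_sync(int *nr_to_scan)
{
	/* stand-in for xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, ...) */
	*nr_to_scan = 0;
}

static int demo_reclaimable_count(void)
{
	return 0;	/* stand-in for the per-AG reclaimable inode count */
}

static int demo_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* make sure the cheap background pass is already running */
		demo_kick_background();

		/* cannot recurse into the filesystem from this context */
		if (!(gfp_mask & __GFP_FS))
			return -1;

		/* blocking pass: waits on dirty inodes under IO */
		demo_reclaim_sync(&nr_to_scan);

		/* terminate if we don't exhaust the scan */
		if (nr_to_scan > 0)
			return -1;
	}
	return demo_reclaimable_count();
}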