author     Glauber Costa <glommer@openvz.org>    2013-08-27 20:18:04 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>     2013-09-10 18:56:31 -0400
commit     1d3d4437eae1bb2963faab427f65f90663c64aa1 (patch)
tree       1a5aa2be9b9f260fcd5dbd70b5c4e540b177b3f3 /mm
parent     0ce3d74450815500e31f16a0b65f6bab687985c3 (diff)
vmscan: per-node deferred work
The list_lru infrastructure already keeps per-node LRU lists in its node-specific list_lru_node arrays and provides us with a per-node API, and the shrinkers are properly equipped with node information. This means that we can now focus our shrinking effort on a single node, but the work that is deferred from one run to another is still kept global in nr_in_batch. Work can be deferred, for instance, during direct reclaim under a GFP_NOFS allocation, a situation in which all the filesystem shrinkers will be prevented from running and will accumulate in nr_in_batch the amount of work they should have done but could not.

This creates an impedance problem: upon node pressure, deferred work accumulates and ends up being flushed on other nodes. The problem is particularly harmful on big machines, where many nodes can accumulate work at the same time, all adding to the global counter nr_in_batch. As more and more accumulates, we start asking the caches to flush ever bigger numbers. The result is that the caches are depleted and never stabilize. To achieve stable steady-state behavior, we need to tackle this differently.

In this patch we keep the deferred count per node, in the new array nr_deferred[] (the name is also a bit more descriptive), and never accumulate it onto other nodes.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
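For orientation before reading the diff: the core of the change is that each shrinker keeps one atomically updated deferred-work counter per NUMA node, drains it with an exchange at the start of a scan, and hands any unscanned remainder back to that same node's counter only. Below is a minimal, self-contained userspace sketch of that bookkeeping using C11 atomics; all names here (deferred[], scan_node, NR_NODES, BATCH) are illustrative and are not the kernel API, which operates on atomic_long_t shrinker->nr_deferred[nid] in mm/vmscan.c.

/*
 * Simplified userspace model of per-node deferred shrinker work.
 * Hypothetical names; this is a sketch of the idea, not the kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES	4
#define BATCH		128

/* One deferred-work counter per node; never flushed onto other nodes. */
static atomic_long deferred[NR_NODES];

/*
 * Try to scan 'want' objects on node 'nid'.  'can_do_work' models a
 * GFP_NOFS-style restriction that forces the work to be deferred.
 */
static long scan_node(int nid, long want, int can_do_work)
{
	/* Drain this node's backlog so concurrent scanners don't redo it. */
	long total = atomic_exchange(&deferred[nid], 0) + want;
	long done = 0;

	while (can_do_work && total >= BATCH) {
		done += BATCH;		/* pretend a batch of objects was freed */
		total -= BATCH;
	}

	/* Hand the unscanned remainder back to the same node only. */
	if (total > 0)
		atomic_fetch_add(&deferred[nid], total);
	return done;
}

int main(void)
{
	/* Defer work on node 1 (think GFP_NOFS reclaim), then do it later. */
	scan_node(1, 1000, 0);
	printf("node 1 backlog: %ld\n", atomic_load(&deferred[1]));
	printf("node 0 backlog: %ld\n", atomic_load(&deferred[0]));
	printf("node 1 freed on a later scan: %ld\n", scan_node(1, 0, 1));
	return 0;
}

Because the remainder is only ever added back to deferred[nid], pressure on one node can no longer inflate the amount of work flushed on another, which is the steady-state behavior the patch is after.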
Diffstat (limited to 'mm')
-rw-r--r--	mm/vmscan.c	241
1 file changed, 140 insertions(+), 101 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe0d5c458440..799ebceeb4f7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	if (shrinker->count_objects)
+		max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	else
+		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->shrink, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimate number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+
+		if (shrinker->scan_objects) {
+			unsigned long ret;
+			shrinkctl->nr_to_scan = batch_size;
+			ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+			if (ret == SHRINK_STOP)
+				break;
+			freed += ret;
+		} else {
+			int nr_before;
+			long ret;
+
+			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+			ret = do_shrinker_shrink(shrinker, shrinkctl,
+							batch_size);
+			if (ret == -1)
+				break;
+			if (ret < nr_before)
+				freed += nr_before - ret;
+		}
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+				&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		if (shrinker->count_objects)
-			max_pass = shrinker->count_objects(shrinker, shrinkctl);
-		else
-			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-		if (max_pass == 0)
-			continue;
-
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR
-			"shrink_slab: %pF negative objects to delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
-
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-
-			if (shrinker->scan_objects) {
-				unsigned long ret;
-				shrinkctl->nr_to_scan = batch_size;
-				ret = shrinker->scan_objects(shrinker, shrinkctl);
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-				if (ret == SHRINK_STOP)
-					break;
-				freed += ret;
-			} else {
-				int nr_before;
-				long ret;
-
-				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-				ret = do_shrinker_shrink(shrinker, shrinkctl,
-								batch_size);
-				if (ret == -1)
-					break;
-				if (ret < nr_before)
-					freed += nr_before - ret;
-			}
+			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+			    (shrinkctl->nid != 0))
+				break;
 
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages);
 
-			cond_resched();
 		}
-
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out: