author     Glauber Costa <glommer@openvz.org>    2013-08-27 20:18:04 -0400
committer  Al Viro <viro@zeniv.linux.org.uk>     2013-09-10 18:56:31 -0400
commit     1d3d4437eae1bb2963faab427f65f90663c64aa1 (patch)
tree       1a5aa2be9b9f260fcd5dbd70b5c4e540b177b3f3 /mm
parent     0ce3d74450815500e31f16a0b65f6bab687985c3 (diff)
vmscan: per-node deferred work
The list_lru infrastructure already keeps per-node LRU lists in its node-specific list_lru_node arrays and provides us with a per-node API, and the shrinkers are properly equipped with node information. This means that we can now focus our shrinking effort on a single node, but the work that is deferred from one run to another is still kept global in nr_in_batch. Work can be deferred, for instance, during direct reclaim under a GFP_NOFS allocation, a situation in which all the filesystem shrinkers will be prevented from running and will accumulate in nr_in_batch the amount of work they should have done but could not.

This creates an impedance problem: upon node pressure, deferred work accumulates and ends up being flushed on other nodes. The problem is particularly harmful on big machines, where many nodes can accumulate work at the same time, all adding to the global counter nr_in_batch. As more and more accumulates, we start asking the caches to flush ever bigger numbers. The result is that the caches are depleted and never stabilize. To achieve stable steady-state behavior, we need to tackle this differently.

In this patch we keep the deferred count per node, in the new array nr_deferred[] (the name is also a bit more descriptive), and never accumulate it onto other nodes.

Signed-off-by: Glauber Costa <glommer@openvz.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: Carlos Maiolino <cmaiolino@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chuck Lever <chuck.lever@oracle.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Gleb Natapov <gleb@redhat.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: J. Bruce Fields <bfields@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Kent Overstreet <koverstreet@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Thomas Hellstrom <thellstrom@vmware.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
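For orientation before reading the diff: the core of the change is that each shrinker keeps one atomically updated deferred-work counter per NUMA node, drains it with an exchange at the start of a scan, and hands any unscanned remainder back to that same node's counter only. Below is a minimal, self-contained userspace sketch of that bookkeeping using C11 atomics; all names here (deferred[], scan_node, NR_NODES, BATCH) are illustrative and are not the kernel API, which operates on atomic_long_t shrinker->nr_deferred[nid] in mm/vmscan.c.

/*
 * Simplified userspace model of per-node deferred shrinker work.
 * Hypothetical names; this is a sketch of the idea, not the kernel code.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES	4
#define BATCH		128

/* One deferred-work counter per node; never flushed onto other nodes. */
static atomic_long deferred[NR_NODES];

/*
 * Try to scan 'want' objects on node 'nid'.  'can_do_work' models a
 * GFP_NOFS-style restriction that forces the work to be deferred.
 */
static long scan_node(int nid, long want, int can_do_work)
{
	/* Drain this node's backlog so concurrent scanners don't redo it. */
	long total = atomic_exchange(&deferred[nid], 0) + want;
	long done = 0;

	while (can_do_work && total >= BATCH) {
		done += BATCH;		/* pretend a batch of objects was freed */
		total -= BATCH;
	}

	/* Hand the unscanned remainder back to the same node only. */
	if (total > 0)
		atomic_fetch_add(&deferred[nid], total);
	return done;
}

int main(void)
{
	/* Defer work on node 1 (think GFP_NOFS reclaim), then do it later. */
	scan_node(1, 1000, 0);
	printf("node 1 backlog: %ld\n", atomic_load(&deferred[1]));
	printf("node 0 backlog: %ld\n", atomic_load(&deferred[0]));
	printf("node 1 freed on a later scan: %ld\n", scan_node(1, 0, 1));
	return 0;
}

Because the remainder is only ever added back to deferred[nid], pressure on one node can no longer inflate the amount of work flushed on another, which is the steady-state behavior the patch is after.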
Diffstat (limited to 'mm')
-rw-r--r--	mm/vmscan.c	241
1 file changed, 140 insertions(+), 101 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fe0d5c458440..799ebceeb4f7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -155,14 +155,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -186,6 +203,118 @@ static inline int do_shrinker_shrink(struct shrinker *shrinker,
 }
 
 #define SHRINK_BATCH 128
+
+static unsigned long
+shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
+		 unsigned long nr_pages_scanned, unsigned long lru_pages)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	if (shrinker->count_objects)
+		max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	else
+		max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = (4 * nr_pages_scanned) / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, lru_pages + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->shrink, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large nr value:
+	 * never try to free more than twice the estimate number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				nr_pages_scanned, lru_pages,
+				max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+
+		if (shrinker->scan_objects) {
+			unsigned long ret;
+			shrinkctl->nr_to_scan = batch_size;
+			ret = shrinker->scan_objects(shrinker, shrinkctl);
+
+			if (ret == SHRINK_STOP)
+				break;
+			freed += ret;
+		} else {
+			int nr_before;
+			long ret;
+
+			nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
+			ret = do_shrinker_shrink(shrinker, shrinkctl,
+							batch_size);
+			if (ret == -1)
+				break;
+			if (ret < nr_before)
+				freed += nr_before - ret;
+		}
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
+	/*
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
+	 */
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+				&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
+}
+
 /*
  * Call the shrink functions to age shrinkable caches
  *
@@ -227,108 +356,18 @@ unsigned long shrink_slab(struct shrink_control *shrinkctl,
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		if (shrinker->count_objects)
-			max_pass = shrinker->count_objects(shrinker, shrinkctl);
-		else
-			max_pass = do_shrinker_shrink(shrinker, shrinkctl, 0);
-		if (max_pass == 0)
-			continue;
-
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR
-			"shrink_slab: %pF negative objects to delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
-
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
-
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
-
-		trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
-
-		while (total_scan >= batch_size) {
-
-			if (shrinker->scan_objects) {
-				unsigned long ret;
-				shrinkctl->nr_to_scan = batch_size;
-				ret = shrinker->scan_objects(shrinker, shrinkctl);
+		for_each_node_mask(shrinkctl->nid, shrinkctl->nodes_to_scan) {
+			if (!node_online(shrinkctl->nid))
+				continue;
 
-				if (ret == SHRINK_STOP)
-					break;
-				freed += ret;
-			} else {
-				int nr_before;
-				long ret;
-
-				nr_before = do_shrinker_shrink(shrinker, shrinkctl, 0);
-				ret = do_shrinker_shrink(shrinker, shrinkctl,
-								batch_size);
-				if (ret == -1)
-					break;
-				if (ret < nr_before)
-					freed += nr_before - ret;
-			}
+			if (!(shrinker->flags & SHRINKER_NUMA_AWARE) &&
+			    (shrinkctl->nid != 0))
+				break;
 
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+			freed += shrink_slab_node(shrinkctl, shrinker,
+				 nr_pages_scanned, lru_pages);
 
-			cond_resched();
 		}
-
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
-
-		trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out: