aboutsummaryrefslogtreecommitdiffstats
path: root/mm/vmstat.c
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2007-05-09 05:35:14 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-05-09 15:30:56 -0400
commit4037d452202e34214e8a939fa5621b2b3bbb45b7 (patch)
tree31b59c0ca94fba4d53b6738b0bad3d1e9fde3063 /mm/vmstat.c
parent77461ab33229d48614402decfb1b2eaa6d446861 (diff)
Move remote node draining out of slab allocators
Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB. Move the node draining so that it is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor. Add an expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds. Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for a large amount of memory being caught in them. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--mm/vmstat.c54
1 files changed, 49 insertions, 5 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 006eb7621869..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281 281
282/* 282/*
283 * Update the zone counters for one cpu. 283 * Update the zone counters for one cpu.
284 *
285 * Note that refresh_cpu_vm_stats strives to only access
286 * node local memory. The per cpu pagesets on remote zones are placed
287 * in the memory local to the processor using that pageset. So the
288 * loop over all zones will access a series of cachelines local to
289 * the processor.
290 *
291 * The call to zone_page_state_add updates the cachelines with the
292 * statistics in the remote zone struct as well as the global cachelines
293 * with the global counters. These could cause remote node cache line
294 * bouncing and must be done only when necessary.
284 */ 295 */
285void refresh_cpu_vm_stats(int cpu) 296void refresh_cpu_vm_stats(int cpu)
286{ 297{
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
289 unsigned long flags; 300 unsigned long flags;
290 301
291 for_each_zone(zone) { 302 for_each_zone(zone) {
292 struct per_cpu_pageset *pcp; 303 struct per_cpu_pageset *p;
293 304
294 if (!populated_zone(zone)) 305 if (!populated_zone(zone))
295 continue; 306 continue;
296 307
297 pcp = zone_pcp(zone, cpu); 308 p = zone_pcp(zone, cpu);
298 309
299 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 310 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300 if (pcp->vm_stat_diff[i]) { 311 if (p->vm_stat_diff[i]) {
301 local_irq_save(flags); 312 local_irq_save(flags);
302 zone_page_state_add(pcp->vm_stat_diff[i], 313 zone_page_state_add(p->vm_stat_diff[i],
303 zone, i); 314 zone, i);
304 pcp->vm_stat_diff[i] = 0; 315 p->vm_stat_diff[i] = 0;
316#ifdef CONFIG_NUMA
317 /* 3 seconds idle till flush */
318 p->expire = 3;
319#endif
305 local_irq_restore(flags); 320 local_irq_restore(flags);
306 } 321 }
322#ifdef CONFIG_NUMA
323 /*
324 * Deal with draining the remote pageset of this
325 * processor
326 *
327 * Check if there are pages remaining in this pageset
328 * if not then there is nothing to expire.
329 */
330 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
331 continue;
332
333 /*
334 * We never drain zones local to this processor.
335 */
336 if (zone_to_nid(zone) == numa_node_id()) {
337 p->expire = 0;
338 continue;
339 }
340
341 p->expire--;
342 if (p->expire)
343 continue;
344
345 if (p->pcp[0].count)
346 drain_zone_pages(zone, p->pcp + 0);
347
348 if (p->pcp[1].count)
349 drain_zone_pages(zone, p->pcp + 1);
350#endif
307 } 351 }
308} 352}
309 353