author		Christoph Lameter <clameter@sgi.com>		2007-05-09 05:35:14 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-09 15:30:56 -0400
commit		4037d452202e34214e8a939fa5621b2b3bbb45b7 (patch)
tree		31b59c0ca94fba4d53b6738b0bad3d1e9fde3063
parent		77461ab33229d48614402decfb1b2eaa6d446861 (diff)
Move remote node draining out of slab allocators
Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB.

Move the node draining so that it is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor.

Add an expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds.

Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps, which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for large amounts of memory being caught in them.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
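The expire/drain policy described in the changelog can be illustrated outside the kernel. Below is a minimal user-space C sketch of that policy for a single remote pageset; the names pcp_sketch, vmstat_pass and drain_batch are illustrative stand-ins rather than kernel API, and the feedback through the vm counters is modeled directly by rearming the expire counter after a drain.

/*
 * Minimal user-space sketch of the expire/drain policy described above.
 * The type and helper names (pcp_sketch, vmstat_pass, drain_batch) are
 * illustrative stand-ins, not the kernel's actual structures or API.
 */
#include <stdio.h>

struct pcp_sketch {
	int count;		/* pages currently sitting in the pcp queue */
	int batch;		/* pages released by one drain call */
	int expire;		/* vmstat passes left before draining starts */
};

/* Free at most one batch, as a single drain_zone_pages() call would. */
static void drain_batch(struct pcp_sketch *p)
{
	int to_drain = p->count >= p->batch ? p->batch : p->count;

	p->count -= to_drain;
	printf("drained %d pages, %d left\n", to_drain, p->count);
}

/* One vmstat update pass for a pageset belonging to a remote node. */
static void vmstat_pass(struct pcp_sketch *p, int counters_were_dirty)
{
	if (counters_were_dirty) {
		p->expire = 3;	/* pageset is in use: rearm, do not drain */
		return;
	}
	if (!p->expire || !p->count)
		return;		/* nothing armed or nothing queued */
	if (--p->expire)
		return;
	drain_batch(p);
	/*
	 * In the kernel the bulk free dirties the vm counters, so the next
	 * pass rearms expire to 3; that feedback is modeled directly here.
	 */
	if (p->count)
		p->expire = 3;
}

int main(void)
{
	struct pcp_sketch p = { .count = 40, .batch = 16, .expire = 3 };
	int pass;

	/* With vmstat running once a second, one batch drains per ~3 passes. */
	for (pass = 0; pass < 12 && p.count; pass++)
		vmstat_pass(&p, 0);
	return 0;
}

Run as-is, the sketch drains one batch on the 3rd, 6th and 9th pass until the queue is empty, which matches the once-every-three-seconds cadence the changelog describes for a one-second vmstat interval.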
-rw-r--r--	include/linux/gfp.h	  6
-rw-r--r--	include/linux/mmzone.h	  3
-rw-r--r--	mm/page_alloc.c		 45
-rw-r--r--	mm/slab.c		  6
-rw-r--r--	mm/slub.c		 84
-rw-r--r--	mm/vmstat.c		 54
6 files changed, 67 insertions(+), 131 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3d96e2..0d2ef0b082a6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e83042..d09b1345a3a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
 
 struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+	s8 expire;
+#endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d53cbf8acb8e..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
 
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	int i;
-	enum zone_type z;
 	unsigned long flags;
+	int to_drain;
 
-	for (z = 0; z < MAX_NR_ZONES; z++) {
-		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-		struct per_cpu_pageset *pset;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, smp_processor_id());
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			if (pcp->count) {
-				int to_drain;
-
-				local_irq_save(flags);
-				if (pcp->count >= pcp->batch)
-					to_drain = pcp->batch;
-				else
-					to_drain = pcp->count;
-				free_pages_bulk(zone, to_drain, &pcp->list, 0);
-				pcp->count -= to_drain;
-				local_irq_restore(flags);
-			}
-		}
-	}
+	local_irq_save(flags);
+	if (pcp->count >= pcp->batch)
+		to_drain = pcp->batch;
+	else
+		to_drain = pcp->count;
+	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	pcp->count -= to_drain;
+	local_irq_restore(flags);
 }
 #endif
 
diff --git a/mm/slab.c b/mm/slab.c
index e50908b2bfac..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
 {
 	int node = __get_cpu_var(reap_node);
 
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
 	node = next_node(node, node_online_map);
 	if (unlikely(node >= MAX_NUMNODES))
 		node = first_node(node_online_map);
diff --git a/mm/slub.c b/mm/slub.c
index dbb206503a8d..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	__get_cpu_var(reap_node) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC	(2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
-	next_reap_node();
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
-}
-
-static void __devinit start_cpu_timer(int cpu)
-{
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-
-	/*
-	 * When this gets called from do_initcalls via cpucache_init(),
-	 * init_workqueues() has already run, so keventd will be setup
-	 * at that time.
-	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-	}
-}
-
-static int __init cpucache_init(void)
-{
-	int cpu;
-
-	/*
-	 * Register the timers that drain pcp pages and update vm statistics
-	 */
-	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
-	return 0;
-}
-__initcall(cpucache_init);
-#endif
-
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 006eb7621869..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
 
 /*
  * Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
  */
 void refresh_cpu_vm_stats(int cpu)
 {
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
 	unsigned long flags;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pcp;
+		struct per_cpu_pageset *p;
 
 		if (!populated_zone(zone))
 			continue;
 
-		pcp = zone_pcp(zone, cpu);
+		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (pcp->vm_stat_diff[i]) {
+			if (p->vm_stat_diff[i]) {
 				local_irq_save(flags);
-				zone_page_state_add(pcp->vm_stat_diff[i],
+				zone_page_state_add(p->vm_stat_diff[i],
 					zone, i);
-				pcp->vm_stat_diff[i] = 0;
+				p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+				/* 3 seconds idle till flush */
+				p->expire = 3;
+#endif
 				local_irq_restore(flags);
 			}
+#ifdef CONFIG_NUMA
+		/*
+		 * Deal with draining the remote pageset of this
+		 * processor
+		 *
+		 * Check if there are pages remaining in this pageset
+		 * if not then there is nothing to expire.
+		 */
+		if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+			continue;
+
+		/*
+		 * We never drain zones local to this processor.
+		 */
+		if (zone_to_nid(zone) == numa_node_id()) {
+			p->expire = 0;
+			continue;
+		}
+
+		p->expire--;
+		if (p->expire)
+			continue;
+
+		if (p->pcp[0].count)
+			drain_zone_pages(zone, p->pcp + 0);
+
+		if (p->pcp[1].count)
+			drain_zone_pages(zone, p->pcp + 1);
+#endif
 	}
 }
 