author		Christoph Lameter <clameter@sgi.com>		2007-05-09 05:35:14 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-09 15:30:56 -0400
commit		4037d452202e34214e8a939fa5621b2b3bbb45b7 (patch)
tree		31b59c0ca94fba4d53b6738b0bad3d1e9fde3063
parent		77461ab33229d48614402decfb1b2eaa6d446861 (diff)
Move remote node draining out of slab allocators
Currently the slab allocators contain callbacks into the page allocator to perform the draining of pagesets on remote nodes. This requires SLUB to have a whole subsystem in order to be compatible with SLAB. Moving node draining out of the slab allocators avoids a section of code in SLUB.

Move the node draining so that it is done when the vm statistics are updated. At that point we are already touching all the cachelines with the pagesets of a processor.

Add an expire counter there. If we have to update per zone or global vm statistics then assume that the pageset will require subsequent draining. The expire counter will be decremented on each vm stats update pass until it reaches zero. Then we will drain one batch from the pageset. The draining will cause vm counter updates which will then cause another expiration until the pcp is empty. So we will drain a batch every 3 seconds.

Note that remote node draining is a somewhat esoteric feature that is required on large NUMA systems because otherwise significant portions of system memory can become trapped in pcp queues. The number of pcps is determined by the number of processors and nodes in a system. A system with 4 processors and 2 nodes has 8 pcps, which is okay. But a system with 1024 processors and 512 nodes has 512k pcps with a high potential for large amounts of memory being caught in them.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
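The expire/drain policy described in the changelog can be illustrated outside the kernel. Below is a minimal user-space C sketch of that policy for a single remote pageset; the names pcp_sketch, vmstat_pass and drain_batch are illustrative stand-ins rather than kernel API, and the feedback through the vm counters is modeled directly by rearming the expire counter after a drain.

/*
 * Minimal user-space sketch of the expire/drain policy described above.
 * The type and helper names (pcp_sketch, vmstat_pass, drain_batch) are
 * illustrative stand-ins, not the kernel's actual structures or API.
 */
#include <stdio.h>

struct pcp_sketch {
	int count;		/* pages currently sitting in the pcp queue */
	int batch;		/* pages released by one drain call */
	int expire;		/* vmstat passes left before draining starts */
};

/* Free at most one batch, as a single drain_zone_pages() call would. */
static void drain_batch(struct pcp_sketch *p)
{
	int to_drain = p->count >= p->batch ? p->batch : p->count;

	p->count -= to_drain;
	printf("drained %d pages, %d left\n", to_drain, p->count);
}

/* One vmstat update pass for a pageset belonging to a remote node. */
static void vmstat_pass(struct pcp_sketch *p, int counters_were_dirty)
{
	if (counters_were_dirty) {
		p->expire = 3;	/* pageset is in use: rearm, do not drain */
		return;
	}
	if (!p->expire || !p->count)
		return;		/* nothing armed or nothing queued */
	if (--p->expire)
		return;
	drain_batch(p);
	/*
	 * In the kernel the bulk free dirties the vm counters, so the next
	 * pass rearms expire to 3; that feedback is modeled directly here.
	 */
	if (p->count)
		p->expire = 3;
}

int main(void)
{
	struct pcp_sketch p = { .count = 40, .batch = 16, .expire = 3 };
	int pass;

	/* With vmstat running once a second, one batch drains per ~3 passes. */
	for (pass = 0; pass < 12 && p.count; pass++)
		vmstat_pass(&p, 0);
	return 0;
}

Run as-is, the sketch drains one batch on the 3rd, 6th and 9th pass until the queue is empty, which matches the once-every-three-seconds cadence the changelog describes for a one-second vmstat interval.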
-rw-r--r--	include/linux/gfp.h	  6
-rw-r--r--	include/linux/mmzone.h	  3
-rw-r--r--	mm/page_alloc.c		 45
-rw-r--r--	mm/slab.c		  6
-rw-r--r--	mm/slub.c		 84
-rw-r--r--	mm/vmstat.c		 54
6 files changed, 67 insertions(+), 131 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3d96e2..0d2ef0b082a6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e83042..d09b1345a3a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
 
 struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+	s8 expire;
+#endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d53cbf8acb8e..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
 
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
+ * Called from the vmstat counter updater to drain pagesets of this
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	int i;
-	enum zone_type z;
 	unsigned long flags;
+	int to_drain;
 
-	for (z = 0; z < MAX_NR_ZONES; z++) {
-		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
-		struct per_cpu_pageset *pset;
-
-		if (!populated_zone(zone))
-			continue;
-
-		pset = zone_pcp(zone, smp_processor_id());
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			if (pcp->count) {
-				int to_drain;
-
-				local_irq_save(flags);
-				if (pcp->count >= pcp->batch)
-					to_drain = pcp->batch;
-				else
-					to_drain = pcp->count;
-				free_pages_bulk(zone, to_drain, &pcp->list, 0);
-				pcp->count -= to_drain;
-				local_irq_restore(flags);
-			}
-		}
-	}
+	local_irq_save(flags);
+	if (pcp->count >= pcp->batch)
+		to_drain = pcp->batch;
+	else
+		to_drain = pcp->count;
+	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	pcp->count -= to_drain;
+	local_irq_restore(flags);
 }
 #endif
 
diff --git a/mm/slab.c b/mm/slab.c
index e50908b2bfac..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
 {
 	int node = __get_cpu_var(reap_node);
 
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
 	node = next_node(node, node_online_map);
 	if (unlikely(node >= MAX_NUMNODES))
 		node = first_node(node_online_map);
diff --git a/mm/slub.c b/mm/slub.c
index dbb206503a8d..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	__get_cpu_var(reap_node) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-#define REAPTIMEOUT_CPUC	(2*HZ)
-
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-
-static void cache_reap(struct work_struct *unused)
-{
-	next_reap_node();
-	schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
-}
-
-static void __devinit start_cpu_timer(int cpu)
-{
-	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-
-	/*
-	 * When this gets called from do_initcalls via cpucache_init(),
-	 * init_workqueues() has already run, so keventd will be setup
-	 * at that time.
-	 */
-	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
-		INIT_DELAYED_WORK(reap_work, cache_reap);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-	}
-}
-
-static int __init cpucache_init(void)
-{
-	int cpu;
-
-	/*
-	 * Register the timers that drain pcp pages and update vm statistics
-	 */
-	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
-	return 0;
-}
-__initcall(cpucache_init);
-#endif
-
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
 	struct kmem_cache *s = get_slab(size, gfpflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 006eb7621869..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
 
 /*
  * Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
  */
 void refresh_cpu_vm_stats(int cpu)
 {
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
 	unsigned long flags;
 
 	for_each_zone(zone) {
-		struct per_cpu_pageset *pcp;
+		struct per_cpu_pageset *p;
 
 		if (!populated_zone(zone))
 			continue;
 
-		pcp = zone_pcp(zone, cpu);
+		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-			if (pcp->vm_stat_diff[i]) {
+			if (p->vm_stat_diff[i]) {
 				local_irq_save(flags);
-				zone_page_state_add(pcp->vm_stat_diff[i],
+				zone_page_state_add(p->vm_stat_diff[i],
 					zone, i);
-				pcp->vm_stat_diff[i] = 0;
+				p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+				/* 3 seconds idle till flush */
+				p->expire = 3;
+#endif
 				local_irq_restore(flags);
 			}
+#ifdef CONFIG_NUMA
+		/*
+		 * Deal with draining the remote pageset of this
+		 * processor
+		 *
+		 * Check if there are pages remaining in this pageset
+		 * if not then there is nothing to expire.
+		 */
+		if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+			continue;
+
+		/*
+		 * We never drain zones local to this processor.
+		 */
+		if (zone_to_nid(zone) == numa_node_id()) {
+			p->expire = 0;
+			continue;
+		}
+
+		p->expire--;
+		if (p->expire)
+			continue;
+
+		if (p->pcp[0].count)
+			drain_zone_pages(zone, p->pcp + 0);
+
+		if (p->pcp[1].count)
+			drain_zone_pages(zone, p->pcp + 1);
+#endif
 	}
 }
 