6 files changed, 67 insertions, 131 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 97a36c3d96e2..0d2ef0b082a6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
 #define free_page(addr) free_pages((addr),0)
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
 #endif /* __LINUX_GFP_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2f1544e83042..d09b1345a3a1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
 struct per_cpu_pageset {
        struct per_cpu_pages pcp[2];    /* 0: hot.  1: cold */
+#ifdef CONFIG_NUMA
+        s8 expire;
+#endif
 #ifdef CONFIG_SMP
        s8 stat_threshold;
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d53cbf8acb8e..f9b5d6d5f4d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
+ * Called from the vmstat counter updater to drain pagesets of this
- * belongs to the currently executing processor.
+ * currently executing processor on remote nodes after they have
+ * expired.
+ *
 * Note that this function must be called with the thread pinned to
 * a single processor.
 */
-void drain_node_pages(int nodeid)
+void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-        int i;
-        enum zone_type z;
        unsigned long flags;
+        int to_drain;
-        for (z = 0; z < MAX_NR_ZONES; z++) {
+        local_irq_save(flags);
-                struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
+        if (pcp->count >= pcp->batch)
-                struct per_cpu_pageset *pset;
+                to_drain = pcp->batch;
+        else
-                if (!populated_zone(zone))
+                to_drain = pcp->count;
-                        continue;
+        free_pages_bulk(zone, to_drain, &pcp->list, 0);
+        pcp->count -= to_drain;
-                pset = zone_pcp(zone, smp_processor_id());
+        local_irq_restore(flags);
-                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-                        struct per_cpu_pages *pcp;
-                        pcp = &pset->pcp[i];
-                        if (pcp->count) {
-                                int to_drain;
-                                local_irq_save(flags);
-                                if (pcp->count >= pcp->batch)
-                                        to_drain = pcp->batch;
-                                else
-                                        to_drain = pcp->count;
-                                free_pages_bulk(zone, to_drain, &pcp->list, 0);
-                                pcp->count -= to_drain;
-                                local_irq_restore(flags);
-                        }
-                }
-        }
 }
 #endif
diff --git a/mm/slab.c b/mm/slab.c
index e50908b2bfac..944b20581f8c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
 {
        int node = __get_cpu_var(reap_node);
-        /*
-         * Also drain per cpu pages on remote zones
-         */
-        if (node != numa_node_id())
-                drain_node_pages(node);
        node = next_node(node, node_online_map);
        if (unlikely(node >= MAX_NUMNODES))
                node = first_node(node_online_map);
diff --git a/mm/slub.c b/mm/slub.c
index dbb206503a8d..bd2efae02bcd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
 #endif
-#ifdef CONFIG_NUMA
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/
-static DEFINE_PER_CPU(unsigned long, reap_node);
-static void init_reap_node(int cpu)
-{
-        int node;
-        node = next_node(cpu_to_node(cpu), node_online_map);
-        if (node == MAX_NUMNODES)
-                node = first_node(node_online_map);
-        __get_cpu_var(reap_node) = node;
-}
-static void next_reap_node(void)
-{
-        int node = __get_cpu_var(reap_node);
-        /*
-         * Also drain per cpu pages on remote zones
-         */
-        if (node != numa_node_id())
-                drain_node_pages(node);
-        node = next_node(node, node_online_map);
-        if (unlikely(node >= MAX_NUMNODES))
-                node = first_node(node_online_map);
-        __get_cpu_var(reap_node) = node;
-}
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-#define REAPTIMEOUT_CPUC        (2*HZ)
-#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
-static void cache_reap(struct work_struct *unused)
-{
-        next_reap_node();
-        schedule_delayed_work(&__get_cpu_var(reap_work),
-                                      REAPTIMEOUT_CPUC);
-}
-static void __devinit start_cpu_timer(int cpu)
-{
-        struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
-        /*
-         * When this gets called from do_initcalls via cpucache_init(),
-         * init_workqueues() has already run, so keventd will be setup
-         * at that time.
-         */
-        if (keventd_up() && reap_work->work.func == NULL) {
-                init_reap_node(cpu);
-                INIT_DELAYED_WORK(reap_work, cache_reap);
-                schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
-        }
-}
-static int __init cpucache_init(void)
-{
-        int cpu;
-        /*
-         * Register the timers that drain pcp pages and update vm statistics
-         */
-        for_each_online_cpu(cpu)
-                start_cpu_timer(cpu);
-        return 0;
-}
-__initcall(cpucache_init);
-#endif
 void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
 {
        struct kmem_cache *s = get_slab(size, gfpflags);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 006eb7621869..9832d9a41d8c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
 /*
 * Update the zone counters for one cpu.
+ *
+ * Note that refresh_cpu_vm_stats strives to only access
+ * node local memory. The per cpu pagesets on remote zones are placed
+ * in the memory local to the processor using that pageset. So the
+ * loop over all zones will access a series of cachelines local to
+ * the processor.
+ *
+ * The call to zone_page_state_add updates the cachelines with the
+ * statistics in the remote zone struct as well as the global cachelines
+ * with the global counters. These could cause remote node cache line
+ * bouncing and will have to be only done when necessary.
 */
 void refresh_cpu_vm_stats(int cpu)
 {
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
        unsigned long flags;
        for_each_zone(zone) {
-                struct per_cpu_pageset *pcp;
+                struct per_cpu_pageset *p;
                if (!populated_zone(zone))
                        continue;
-                pcp = zone_pcp(zone, cpu);
+                p = zone_pcp(zone, cpu);
                for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-                        if (pcp->vm_stat_diff[i]) {
+                        if (p->vm_stat_diff[i]) {
                                local_irq_save(flags);
-                                zone_page_state_add(pcp->vm_stat_diff[i],
+                                zone_page_state_add(p->vm_stat_diff[i],
                                        zone, i);
-                                pcp->vm_stat_diff[i] = 0;
+                                p->vm_stat_diff[i] = 0;
+#ifdef CONFIG_NUMA
+                                /* 3 seconds idle till flush */
+                                p->expire = 3;
+#endif
                                local_irq_restore(flags);
                        }
+#ifdef CONFIG_NUMA
+                /*
+                 * Deal with draining the remote pageset of this
+                 * processor
+                 *
+                 * Check if there are pages remaining in this pageset
+                 * if not then there is nothing to expire.
+                 */
+                if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
+                        continue;
+                /*
+                 * We never drain zones local to this processor.
+                 */
+                if (zone_to_nid(zone) == numa_node_id()) {
+                        p->expire = 0;
+                        continue;
+                }
+                p->expire--;
+                if (p->expire)
+                        continue;
+                if (p->pcp[0].count)
+                        drain_zone_pages(zone, p->pcp + 0);
+                if (p->pcp[1].count)
+                        drain_zone_pages(zone, p->pcp + 1);
+#endif
        }
 }

diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 97a36c3d96e2..0d2ef0b082a6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h
@@ -176,10 +176,6 @@ extern void FASTCALL(free_cold_page(struct page *page));
176	#define free_page(addr) free_pages((addr),0)	176	#define free_page(addr) free_pages((addr),0)
177		177
178	void page_alloc_init(void);	178	void page_alloc_init(void);
179	#ifdef CONFIG_NUMA	179	void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
180	void drain_node_pages(int node);
181	#else
182	static inline void drain_node_pages(int node) { };
183	#endif
184		180
185	#endif /* __LINUX_GFP_H */	181	#endif /* __LINUX_GFP_H */


diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f1544e83042..d09b1345a3a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h
@@ -83,6 +83,9 @@ struct per_cpu_pages {
83		83
84	struct per_cpu_pageset {	84	struct per_cpu_pageset {
85	struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */	85	struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
		86	#ifdef CONFIG_NUMA
		87	s8 expire;
		88	#endif
86	#ifdef CONFIG_SMP	89	#ifdef CONFIG_SMP
87	s8 stat_threshold;	90	s8 stat_threshold;
88	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];	91	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];


diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d53cbf8acb8e..f9b5d6d5f4d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -691,43 +691,26 @@ static void __init setup_nr_node_ids(void) {}
691		691
692	#ifdef CONFIG_NUMA	692	#ifdef CONFIG_NUMA
693	/*	693	/*
694	* Called from the slab reaper to drain pagesets on a particular node that	694	* Called from the vmstat counter updater to drain pagesets of this
695	* belongs to the currently executing processor.	695	* currently executing processor on remote nodes after they have
		696	* expired.
		697	*
696	* Note that this function must be called with the thread pinned to	698	* Note that this function must be called with the thread pinned to
697	* a single processor.	699	* a single processor.
698	*/	700	*/
699	void drain_node_pages(int nodeid)	701	void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
700	{	702	{
701	int i;
702	enum zone_type z;
703	unsigned long flags;	703	unsigned long flags;
		704	int to_drain;
704		705
705	for (z = 0; z < MAX_NR_ZONES; z++) {	706	local_irq_save(flags);
706	struct zone *zone = NODE_DATA(nodeid)->node_zones + z;	707	if (pcp->count >= pcp->batch)
707	struct per_cpu_pageset *pset;	708	to_drain = pcp->batch;
708		709	else
709	if (!populated_zone(zone))	710	to_drain = pcp->count;
710	continue;	711	free_pages_bulk(zone, to_drain, &pcp->list, 0);
711		712	pcp->count -= to_drain;
712	pset = zone_pcp(zone, smp_processor_id());	713	local_irq_restore(flags);
713	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
714	struct per_cpu_pages *pcp;
715
716	pcp = &pset->pcp[i];
717	if (pcp->count) {
718	int to_drain;
719
720	local_irq_save(flags);
721	if (pcp->count >= pcp->batch)
722	to_drain = pcp->batch;
723	else
724	to_drain = pcp->count;
725	free_pages_bulk(zone, to_drain, &pcp->list, 0);
726	pcp->count -= to_drain;
727	local_irq_restore(flags);
728	}
729	}
730	}
731	}	714	}
732	#endif	715	#endif
733		716


diff --git a/mm/slab.c b/mm/slab.c index e50908b2bfac..944b20581f8c 100644 --- a/mm/slab.c +++ b/mm/slab.c
@@ -928,12 +928,6 @@ static void next_reap_node(void)
928	{	928	{
929	int node = __get_cpu_var(reap_node);	929	int node = __get_cpu_var(reap_node);
930		930
931	/*
932	* Also drain per cpu pages on remote zones
933	*/
934	if (node != numa_node_id())
935	drain_node_pages(node);
936
937	node = next_node(node, node_online_map);	931	node = next_node(node, node_online_map);
938	if (unlikely(node >= MAX_NUMNODES))	932	if (unlikely(node >= MAX_NUMNODES))
939	node = first_node(node_online_map);	933	node = first_node(node_online_map);


diff --git a/mm/slub.c b/mm/slub.c index dbb206503a8d..bd2efae02bcd 100644 --- a/mm/slub.c +++ b/mm/slub.c
@@ -2530,90 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier =
2530		2530
2531	#endif	2531	#endif
2532		2532
2533	#ifdef CONFIG_NUMA
2534
2535	/*****************************************************************
2536	* Generic reaper used to support the page allocator
2537	* (the cpu slabs are reaped by a per slab workqueue).
2538	*
2539	* Maybe move this to the page allocator?
2540	****************************************************************/
2541
2542	static DEFINE_PER_CPU(unsigned long, reap_node);
2543
2544	static void init_reap_node(int cpu)
2545	{
2546	int node;
2547
2548	node = next_node(cpu_to_node(cpu), node_online_map);
2549	if (node == MAX_NUMNODES)
2550	node = first_node(node_online_map);
2551
2552	__get_cpu_var(reap_node) = node;
2553	}
2554
2555	static void next_reap_node(void)
2556	{
2557	int node = __get_cpu_var(reap_node);
2558
2559	/*
2560	* Also drain per cpu pages on remote zones
2561	*/
2562	if (node != numa_node_id())
2563	drain_node_pages(node);
2564
2565	node = next_node(node, node_online_map);
2566	if (unlikely(node >= MAX_NUMNODES))
2567	node = first_node(node_online_map);
2568	__get_cpu_var(reap_node) = node;
2569	}
2570	#else
2571	#define init_reap_node(cpu) do { } while (0)
2572	#define next_reap_node(void) do { } while (0)
2573	#endif
2574
2575	#define REAPTIMEOUT_CPUC (2*HZ)
2576
2577	#ifdef CONFIG_SMP
2578	static DEFINE_PER_CPU(struct delayed_work, reap_work);
2579
2580	static void cache_reap(struct work_struct *unused)
2581	{
2582	next_reap_node();
2583	schedule_delayed_work(&__get_cpu_var(reap_work),
2584	REAPTIMEOUT_CPUC);
2585	}
2586
2587	static void __devinit start_cpu_timer(int cpu)
2588	{
2589	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
2590
2591	/*
2592	* When this gets called from do_initcalls via cpucache_init(),
2593	* init_workqueues() has already run, so keventd will be setup
2594	* at that time.
2595	*/
2596	if (keventd_up() && reap_work->work.func == NULL) {
2597	init_reap_node(cpu);
2598	INIT_DELAYED_WORK(reap_work, cache_reap);
2599	schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
2600	}
2601	}
2602
2603	static int __init cpucache_init(void)
2604	{
2605	int cpu;
2606
2607	/*
2608	* Register the timers that drain pcp pages and update vm statistics
2609	*/
2610	for_each_online_cpu(cpu)
2611	start_cpu_timer(cpu);
2612	return 0;
2613	}
2614	__initcall(cpucache_init);
2615	#endif
2616
2617	void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)	2533	void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2618	{	2534	{
2619	struct kmem_cache *s = get_slab(size, gfpflags);	2535	struct kmem_cache *s = get_slab(size, gfpflags);


diff --git a/mm/vmstat.c b/mm/vmstat.c index 006eb7621869..9832d9a41d8c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c
@@ -281,6 +281,17 @@ EXPORT_SYMBOL(dec_zone_page_state);
281		281
282	/*	282	/*
283	* Update the zone counters for one cpu.	283	* Update the zone counters for one cpu.
		284	*
		285	* Note that refresh_cpu_vm_stats strives to only access
		286	* node local memory. The per cpu pagesets on remote zones are placed
		287	* in the memory local to the processor using that pageset. So the
		288	* loop over all zones will access a series of cachelines local to
		289	* the processor.
		290	*
		291	* The call to zone_page_state_add updates the cachelines with the
		292	* statistics in the remote zone struct as well as the global cachelines
		293	* with the global counters. These could cause remote node cache line
		294	* bouncing and will have to be only done when necessary.
284	*/	295	*/
285	void refresh_cpu_vm_stats(int cpu)	296	void refresh_cpu_vm_stats(int cpu)
286	{	297	{
@@ -289,21 +300,54 @@ void refresh_cpu_vm_stats(int cpu)
289	unsigned long flags;	300	unsigned long flags;
290		301
291	for_each_zone(zone) {	302	for_each_zone(zone) {
292	struct per_cpu_pageset *pcp;	303	struct per_cpu_pageset *p;
293		304
294	if (!populated_zone(zone))	305	if (!populated_zone(zone))
295	continue;	306	continue;
296		307
297	pcp = zone_pcp(zone, cpu);	308	p = zone_pcp(zone, cpu);
298		309
299	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)	310	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
300	if (pcp->vm_stat_diff[i]) {	311	if (p->vm_stat_diff[i]) {
301	local_irq_save(flags);	312	local_irq_save(flags);
302	zone_page_state_add(pcp->vm_stat_diff[i],	313	zone_page_state_add(p->vm_stat_diff[i],
303	zone, i);	314	zone, i);
304	pcp->vm_stat_diff[i] = 0;	315	p->vm_stat_diff[i] = 0;
		316	#ifdef CONFIG_NUMA
		317	/* 3 seconds idle till flush */
		318	p->expire = 3;
		319	#endif
305	local_irq_restore(flags);	320	local_irq_restore(flags);
306	}	321	}
		322	#ifdef CONFIG_NUMA
		323	/*
		324	* Deal with draining the remote pageset of this
		325	* processor
		326	*
		327	* Check if there are pages remaining in this pageset
		328	* if not then there is nothing to expire.
		329	*/
		330	if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count))
		331	continue;
		332
		333	/*
		334	* We never drain zones local to this processor.
		335	*/
		336	if (zone_to_nid(zone) == numa_node_id()) {
		337	p->expire = 0;
		338	continue;
		339	}
		340
		341	p->expire--;
		342	if (p->expire)
		343	continue;
		344
		345	if (p->pcp[0].count)
		346	drain_zone_pages(zone, p->pcp + 0);
		347
		348	if (p->pcp[1].count)
		349	drain_zone_pages(zone, p->pcp + 1);
		350	#endif
307	}	351	}
308	}	352	}
309		353