aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/base/node.c2
-rw-r--r--include/linux/mm.h6
-rw-r--r--include/linux/mmzone.h11
-rw-r--r--init/main.c1
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/page_alloc.c211
6 files changed, 195 insertions, 38 deletions
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d4517ccc422..904b27caf697 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
87 for (i = 0; i < MAX_NR_ZONES; i++) { 87 for (i = 0; i < MAX_NR_ZONES; i++) {
88 struct zone *z = &pg->node_zones[i]; 88 struct zone *z = &pg->node_zones[i];
89 for (cpu = 0; cpu < NR_CPUS; cpu++) { 89 for (cpu = 0; cpu < NR_CPUS; cpu++) {
90 struct per_cpu_pageset *ps = &z->pageset[cpu]; 90 struct per_cpu_pageset *ps = zone_pcp(z,cpu);
91 numa_hit += ps->numa_hit; 91 numa_hit += ps->numa_hit;
92 numa_miss += ps->numa_miss; 92 numa_miss += ps->numa_miss;
93 numa_foreign += ps->numa_foreign; 93 numa_foreign += ps->numa_foreign;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17518fe0b311..1813b162b0a8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
691extern void si_meminfo(struct sysinfo * val); 691extern void si_meminfo(struct sysinfo * val);
692extern void si_meminfo_node(struct sysinfo *val, int nid); 692extern void si_meminfo_node(struct sysinfo *val, int nid);
693 693
694#ifdef CONFIG_NUMA
695extern void setup_per_cpu_pageset(void);
696#else
697static inline void setup_per_cpu_pageset(void) {}
698#endif
699
694/* prio_tree.c */ 700/* prio_tree.c */
695void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); 701void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
696void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); 702void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18fed8b67943..4733d35d8223 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
63#endif 63#endif
64} ____cacheline_aligned_in_smp; 64} ____cacheline_aligned_in_smp;
65 65
66#ifdef CONFIG_NUMA
67#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
68#else
69#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
70#endif
71
66#define ZONE_DMA 0 72#define ZONE_DMA 0
67#define ZONE_NORMAL 1 73#define ZONE_NORMAL 1
68#define ZONE_HIGHMEM 2 74#define ZONE_HIGHMEM 2
@@ -122,8 +128,11 @@ struct zone {
122 */ 128 */
123 unsigned long lowmem_reserve[MAX_NR_ZONES]; 129 unsigned long lowmem_reserve[MAX_NR_ZONES];
124 130
131#ifdef CONFIG_NUMA
132 struct per_cpu_pageset *pageset[NR_CPUS];
133#else
125 struct per_cpu_pageset pageset[NR_CPUS]; 134 struct per_cpu_pageset pageset[NR_CPUS];
126 135#endif
127 /* 136 /*
128 * free areas of different sizes 137 * free areas of different sizes
129 */ 138 */
diff --git a/init/main.c b/init/main.c
index 40bf367ffdf1..d324801729ba 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
490 vfs_caches_init_early(); 490 vfs_caches_init_early();
491 mem_init(); 491 mem_init();
492 kmem_cache_init(); 492 kmem_cache_init();
493 setup_per_cpu_pageset();
493 numa_policy_init(); 494 numa_policy_init();
494 if (late_time_init) 495 if (late_time_init)
495 late_time_init(); 496 late_time_init();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); 721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl); 722 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) { 723 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++; 724 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
725 put_cpu(); 725 put_cpu();
726 } 726 }
727 return page; 727 return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
72EXPORT_SYMBOL(zone_table); 72EXPORT_SYMBOL(zone_table);
73 73
74#ifdef CONFIG_NUMA
75static struct per_cpu_pageset
76 pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
77#endif
78
74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 79static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
75int min_free_kbytes = 1024; 80int min_free_kbytes = 1024;
76 81
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
520 for_each_zone(zone) { 525 for_each_zone(zone) {
521 struct per_cpu_pageset *pset; 526 struct per_cpu_pageset *pset;
522 527
523 pset = &zone->pageset[cpu]; 528 pset = zone_pcp(zone, cpu);
524 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 529 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
525 struct per_cpu_pages *pcp; 530 struct per_cpu_pages *pcp;
526 531
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
583 588
584 local_irq_save(flags); 589 local_irq_save(flags);
585 cpu = smp_processor_id(); 590 cpu = smp_processor_id();
586 p = &z->pageset[cpu]; 591 p = zone_pcp(z,cpu);
587 if (pg == orig) { 592 if (pg == orig) {
588 z->pageset[cpu].numa_hit++; 593 p->numa_hit++;
589 } else { 594 } else {
590 p->numa_miss++; 595 p->numa_miss++;
591 zonelist->zones[0]->pageset[cpu].numa_foreign++; 596 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
592 } 597 }
593 if (pg == NODE_DATA(numa_node_id())) 598 if (pg == NODE_DATA(numa_node_id()))
594 p->local_node++; 599 p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
615 if (PageAnon(page)) 620 if (PageAnon(page))
616 page->mapping = NULL; 621 page->mapping = NULL;
617 free_pages_check(__FUNCTION__, page); 622 free_pages_check(__FUNCTION__, page);
618 pcp = &zone->pageset[get_cpu()].pcp[cold]; 623 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
619 local_irq_save(flags); 624 local_irq_save(flags);
620 if (pcp->count >= pcp->high) 625 if (pcp->count >= pcp->high)
621 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 626 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
659 if (order == 0) { 664 if (order == 0) {
660 struct per_cpu_pages *pcp; 665 struct per_cpu_pages *pcp;
661 666
662 pcp = &zone->pageset[get_cpu()].pcp[cold]; 667 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
663 local_irq_save(flags); 668 local_irq_save(flags);
664 if (pcp->count <= pcp->low) 669 if (pcp->count <= pcp->low)
665 pcp->count += rmqueue_bulk(zone, 0, 670 pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
1262 if (!cpu_possible(cpu)) 1267 if (!cpu_possible(cpu))
1263 continue; 1268 continue;
1264 1269
1265 pageset = zone->pageset + cpu; 1270 pageset = zone_pcp(zone, cpu);
1266 1271
1267 for (temperature = 0; temperature < 2; temperature++) 1272 for (temperature = 0; temperature < 2; temperature++)
1268 printk("cpu %d %s: low %d, high %d, batch %d\n", 1273 printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1645 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1650 memmap_init_zone((size), (nid), (zone), (start_pfn))
1646#endif 1651#endif
1647 1652
1653static int __devinit zone_batchsize(struct zone *zone)
1654{
1655 int batch;
1656
1657 /*
1658 * The per-cpu-pages pools are set to around 1000th of the
1659 * size of the zone. But no more than 1/4 of a meg - there's
1660 * no point in going beyond the size of L2 cache.
1661 *
1662 * OK, so we don't know how big the cache is. So guess.
1663 */
1664 batch = zone->present_pages / 1024;
1665 if (batch * PAGE_SIZE > 256 * 1024)
1666 batch = (256 * 1024) / PAGE_SIZE;
1667 batch /= 4; /* We effectively *= 4 below */
1668 if (batch < 1)
1669 batch = 1;
1670
1671 /*
1672 * Clamp the batch to a 2^n - 1 value. Having a power
1673 * of 2 value was found to be more likely to have
1674 * suboptimal cache aliasing properties in some cases.
1675 *
1676 * For example if 2 tasks are alternately allocating
1677 * batches of pages, one task can end up with a lot
1678 * of pages of one half of the possible page colors
1679 * and the other with pages of the other colors.
1680 */
1681 batch = (1 << fls(batch + batch/2)) - 1;
1682 return batch;
1683}
1684
1685#ifdef CONFIG_NUMA
1686/*
 1687 * Dynamically allocate memory for the
1688 * per cpu pageset array in struct zone.
1689 */
1690static int __devinit process_zones(int cpu)
1691{
1692 struct zone *zone, *dzone;
1693 int i;
1694
1695 for_each_zone(zone) {
1696 struct per_cpu_pageset *npageset = NULL;
1697
1698 npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
1699 GFP_KERNEL, cpu_to_node(cpu));
1700 if (!npageset) {
1701 zone->pageset[cpu] = NULL;
1702 goto bad;
1703 }
1704
1705 if (zone->pageset[cpu]) {
1706 memcpy(npageset, zone->pageset[cpu],
1707 sizeof(struct per_cpu_pageset));
1708
1709 /* Relocate lists */
1710 for (i = 0; i < 2; i++) {
1711 INIT_LIST_HEAD(&npageset->pcp[i].list);
1712 list_splice(&zone->pageset[cpu]->pcp[i].list,
1713 &npageset->pcp[i].list);
1714 }
1715 } else {
1716 struct per_cpu_pages *pcp;
1717 unsigned long batch;
1718
1719 batch = zone_batchsize(zone);
1720
1721 pcp = &npageset->pcp[0]; /* hot */
1722 pcp->count = 0;
1723 pcp->low = 2 * batch;
1724 pcp->high = 6 * batch;
1725 pcp->batch = 1 * batch;
1726 INIT_LIST_HEAD(&pcp->list);
1727
1728 pcp = &npageset->pcp[1]; /* cold*/
1729 pcp->count = 0;
1730 pcp->low = 0;
1731 pcp->high = 2 * batch;
1732 pcp->batch = 1 * batch;
1733 INIT_LIST_HEAD(&pcp->list);
1734 }
1735 zone->pageset[cpu] = npageset;
1736 }
1737
1738 return 0;
1739bad:
1740 for_each_zone(dzone) {
1741 if (dzone == zone)
1742 break;
1743 kfree(dzone->pageset[cpu]);
1744 dzone->pageset[cpu] = NULL;
1745 }
1746 return -ENOMEM;
1747}
1748
1749static inline void free_zone_pagesets(int cpu)
1750{
1751#ifdef CONFIG_NUMA
1752 struct zone *zone;
1753
1754 for_each_zone(zone) {
1755 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1756
1757 zone_pcp(zone, cpu) = NULL;
1758 kfree(pset);
1759 }
1760#endif
1761}
1762
1763static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1764 unsigned long action,
1765 void *hcpu)
1766{
1767 int cpu = (long)hcpu;
1768 int ret = NOTIFY_OK;
1769
1770 switch (action) {
1771 case CPU_UP_PREPARE:
1772 if (process_zones(cpu))
1773 ret = NOTIFY_BAD;
1774 break;
1775#ifdef CONFIG_HOTPLUG_CPU
1776 case CPU_DEAD:
1777 free_zone_pagesets(cpu);
1778 break;
1779#endif
1780 default:
1781 break;
1782 }
1783 return ret;
1784}
1785
1786static struct notifier_block pageset_notifier =
1787 { &pageset_cpuup_callback, NULL, 0 };
1788
1789void __init setup_per_cpu_pageset()
1790{
1791 int err;
1792
1793 /* Initialize per_cpu_pageset for cpu 0.
1794 * A cpuup callback will do this for every cpu
1795 * as it comes online
1796 */
1797 err = process_zones(smp_processor_id());
1798 BUG_ON(err);
1799 register_cpu_notifier(&pageset_notifier);
1800}
1801
1802#endif
1803
1648/* 1804/*
1649 * Set up the zone data structures: 1805 * Set up the zone data structures:
1650 * - mark all pages reserved 1806 * - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1687 1843
1688 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1844 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1689 1845
1690 /* 1846 batch = zone_batchsize(zone);
1691 * The per-cpu-pages pools are set to around 1000th of the
1692 * size of the zone. But no more than 1/4 of a meg - there's
1693 * no point in going beyond the size of L2 cache.
1694 *
1695 * OK, so we don't know how big the cache is. So guess.
1696 */
1697 batch = zone->present_pages / 1024;
1698 if (batch * PAGE_SIZE > 256 * 1024)
1699 batch = (256 * 1024) / PAGE_SIZE;
1700 batch /= 4; /* We effectively *= 4 below */
1701 if (batch < 1)
1702 batch = 1;
1703
1704 /*
1705 * Clamp the batch to a 2^n - 1 value. Having a power
1706 * of 2 value was found to be more likely to have
1707 * suboptimal cache aliasing properties in some cases.
1708 *
1709 * For example if 2 tasks are alternately allocating
1710 * batches of pages, one task can end up with a lot
1711 * of pages of one half of the possible page colors
1712 * and the other with pages of the other colors.
1713 */
1714 batch = (1 << fls(batch + batch/2)) - 1;
1715 1847
1716 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1848 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1717 struct per_cpu_pages *pcp; 1849 struct per_cpu_pages *pcp;
1850#ifdef CONFIG_NUMA
1851 struct per_cpu_pageset *pgset;
1852 pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
1853 (j * NR_CPUS) + cpu];
1854
1855 zone->pageset[cpu] = pgset;
1856#else
1857 struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
1858#endif
1718 1859
1719 pcp = &zone->pageset[cpu].pcp[0]; /* hot */ 1860 pcp = &pgset->pcp[0]; /* hot */
1720 pcp->count = 0; 1861 pcp->count = 0;
1721 pcp->low = 2 * batch; 1862 pcp->low = 2 * batch;
1722 pcp->high = 6 * batch; 1863 pcp->high = 6 * batch;
1723 pcp->batch = 1 * batch; 1864 pcp->batch = 1 * batch;
1724 INIT_LIST_HEAD(&pcp->list); 1865 INIT_LIST_HEAD(&pcp->list);
1725 1866
1726 pcp = &zone->pageset[cpu].pcp[1]; /* cold */ 1867 pcp = &pgset->pcp[1]; /* cold */
1727 pcp->count = 0; 1868 pcp->count = 0;
1728 pcp->low = 0; 1869 pcp->low = 0;
1729 pcp->high = 2 * batch; 1870 pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
1929 struct per_cpu_pageset *pageset; 2070 struct per_cpu_pageset *pageset;
1930 int j; 2071 int j;
1931 2072
1932 pageset = &zone->pageset[i]; 2073 pageset = zone_pcp(zone, i);
1933 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2074 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
1934 if (pageset->pcp[j].count) 2075 if (pageset->pcp[j].count)
1935 break; 2076 break;