[PATCH] node local per-cpu-pages

This patch modifies the way pagesets in struct zone are managed.

Each zone has a per-cpu array of pagesets. So any particular CPU has some
memory in each zone structure which belongs to itself, even if that CPU is
not local to that zone.

So the patch relocates the pagesets for each cpu to the node that is nearest
to the cpu instead of allocating the pagesets in the (possibly remote) target
zone. This means that the operations to manage pages on remote zone can be
done with information available locally.

We play a macro trick so that non-NUMA machines avoid the additional pointer
chase on the page allocator fastpath.

AIM7 benchmark on a 32 CPU SGI Altix

w/o patches:
Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1      484.68  100       484.6769     12.01      1.97   Fri Mar 25 11:01:42 2005
  100    27140.46   89       271.4046     21.44    148.71   Fri Mar 25 11:02:04 2005
  200    30792.02   82       153.9601     37.80    296.72   Fri Mar 25 11:02:42 2005
  300    32209.27   81       107.3642     54.21    451.34   Fri Mar 25 11:03:37 2005
  400    34962.83   78        87.4071     66.59    588.97   Fri Mar 25 11:04:44 2005
  500    31676.92   75        63.3538     91.87    742.71   Fri Mar 25 11:06:16 2005
  600    36032.69   73        60.0545     96.91    885.44   Fri Mar 25 11:07:54 2005
  700    35540.43   77        50.7720    114.63   1024.28   Fri Mar 25 11:09:49 2005
  800    33906.70   74        42.3834    137.32   1181.65   Fri Mar 25 11:12:06 2005
  900    34120.67   73        37.9119    153.51   1325.26   Fri Mar 25 11:14:41 2005
 1000    34802.37   74        34.8024    167.23   1465.26   Fri Mar 25 11:17:28 2005

with slab API changes and pageset patch:
Tasks    jobs/min  jti  jobs/min/task      real       cpu
    1      485.00  100       485.0000     12.00      1.96   Fri Mar 25 11:46:18 2005
  100    28000.96   89       280.0096     20.79    150.45   Fri Mar 25 11:46:39 2005
  200    32285.80   79       161.4290     36.05    293.37   Fri Mar 25 11:47:16 2005
  300    40424.15   84       134.7472     43.19    438.42   Fri Mar 25 11:47:59 2005
  400    39155.01   79        97.8875     59.46    590.05   Fri Mar 25 11:48:59 2005
  500    37881.25   82        75.7625     76.82    730.19   Fri Mar 25 11:50:16 2005
  600    39083.14   78        65.1386     89.35    872.79   Fri Mar 25 11:51:46 2005
  700    38627.83   77        55.1826    105.47   1022.46   Fri Mar 25 11:53:32 2005
  800    39631.94   78        49.5399    117.48   1169.94   Fri Mar 25 11:55:30 2005
  900    36903.70   79        41.0041    141.94   1310.78   Fri Mar 25 11:57:53 2005
 1000    36201.23   77        36.2012    160.77   1458.31   Fri Mar 25 12:00:34 2005

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <Shai@Scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <christoph@lameter.com> 2005-06-21 20:14:47 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-21 21:46:16 -0400
commit: e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a (patch)
tree: f04f7b0d08cbc46d2f190a85904a3dd696dc6e88 /mm
parent: 63551ae0feaaa23807ebea60de1901564bbef32e (diff)
2 files changed, 177 insertions, 36 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
        zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
        page = __alloc_pages(gfp, order, zl);
        if (page && page_zone(page) == zl->zones[0]) {
-                zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+                zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
                put_cpu();
        }
        return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+        pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
        for_each_zone(zone) {
                struct per_cpu_pageset *pset;
-                pset = &zone->pageset[cpu];
+                pset = zone_pcp(zone, cpu);
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
        local_irq_save(flags);
        cpu = smp_processor_id();
-        p = &z->pageset[cpu];
+        p = zone_pcp(z,cpu);
        if (pg == orig) {
-                z->pageset[cpu].numa_hit++;
+                p->numa_hit++;
        } else {
                p->numa_miss++;
-                zonelist->zones[0]->pageset[cpu].numa_foreign++;
+                zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
        }
        if (pg == NODE_DATA(numa_node_id()))
                p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
        if (PageAnon(page))
                page->mapping = NULL;
        free_pages_check(__FUNCTION__, page);
-        pcp = &zone->pageset[get_cpu()].pcp[cold];
+        pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
        local_irq_save(flags);
        if (pcp->count >= pcp->high)
                pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
        if (order == 0) {
                struct per_cpu_pages *pcp;
-                pcp = &zone->pageset[get_cpu()].pcp[cold];
+                pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                local_irq_save(flags);
                if (pcp->count <= pcp->low)
                        pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
                        if (!cpu_possible(cpu))
                                continue;
-                        pageset = zone->pageset + cpu;
+                        pageset = zone_pcp(zone, cpu);
                        for (temperature = 0; temperature < 2; temperature++)
                                printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
        memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
+static int __devinit zone_batchsize(struct zone *zone)
+{
+        int batch;
+        /*
+         * The per-cpu-pages pools are set to around 1000th of the
+         * size of the zone.  But no more than 1/4 of a meg - there's
+         * no point in going beyond the size of L2 cache.
+         *
+         * OK, so we don't know how big the cache is.  So guess.
+         */
+        batch = zone->present_pages / 1024;
+        if (batch * PAGE_SIZE > 256 * 1024)
+                batch = (256 * 1024) / PAGE_SIZE;
+        batch /= 4;             /* We effectively *= 4 below */
+        if (batch < 1)
+                batch = 1;
+        /*
+         * Clamp the batch to a 2^n - 1 value. Having a power
+         * of 2 value was found to be more likely to have
+         * suboptimal cache aliasing properties in some cases.
+         *
+         * For example if 2 tasks are alternately allocating
+         * batches of pages, one task can end up with a lot
+         * of pages of one half of the possible page colors
+         * and the other with pages of the other colors.
+         */
+        batch = (1 << fls(batch + batch/2)) - 1;
+        return batch;
+}
+#ifdef CONFIG_NUMA
+/*
+ * Dynamicaly allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+        struct zone *zone, *dzone;
+        int i;
+        for_each_zone(zone) {
+                struct per_cpu_pageset *npageset = NULL;
+                npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+                                         GFP_KERNEL, cpu_to_node(cpu));
+                if (!npageset) {
+                        zone->pageset[cpu] = NULL;
+                        goto bad;
+                }
+                if (zone->pageset[cpu]) {
+                        memcpy(npageset, zone->pageset[cpu],
+                                        sizeof(struct per_cpu_pageset));
+                        /* Relocate lists */
+                        for (i = 0; i < 2; i++) {
+                                INIT_LIST_HEAD(&npageset->pcp[i].list);
+                                list_splice(&zone->pageset[cpu]->pcp[i].list,
+                                        &npageset->pcp[i].list);
+                        }
+                } else {
+                        struct per_cpu_pages *pcp;
+                        unsigned long batch;
+                        batch = zone_batchsize(zone);
+                        pcp = &npageset->pcp[0];                /* hot */
+                        pcp->count = 0;
+                        pcp->low = 2 * batch;
+                        pcp->high = 6 * batch;
+                        pcp->batch = 1 * batch;
+                        INIT_LIST_HEAD(&pcp->list);
+                        pcp = &npageset->pcp[1];                /* cold*/
+                        pcp->count = 0;
+                        pcp->low = 0;
+                        pcp->high = 2 * batch;
+                        pcp->batch = 1 * batch;
+                        INIT_LIST_HEAD(&pcp->list);
+                }
+                zone->pageset[cpu] = npageset;
+        }
+        return 0;
+bad:
+        for_each_zone(dzone) {
+                if (dzone == zone)
+                        break;
+                kfree(dzone->pageset[cpu]);
+                dzone->pageset[cpu] = NULL;
+        }
+        return -ENOMEM;
+}
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+        struct zone *zone;
+        for_each_zone(zone) {
+                struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+                zone_pcp(zone, cpu) = NULL;
+                kfree(pset);
+        }
+#endif
+}
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+                unsigned long action,
+                void *hcpu)
+{
+        int cpu = (long)hcpu;
+        int ret = NOTIFY_OK;
+        switch (action) {
+                case CPU_UP_PREPARE:
+                        if (process_zones(cpu))
+                                ret = NOTIFY_BAD;
+                        break;
+#ifdef CONFIG_HOTPLUG_CPU
+                case CPU_DEAD:
+                        free_zone_pagesets(cpu);
+                        break;
+#endif
+                default:
+                        break;
+        }
+        return ret;
+}
+static struct notifier_block pageset_notifier =
+        { &pageset_cpuup_callback, NULL, 0 };
+void __init setup_per_cpu_pageset()
+{
+        int err;
+        /* Initialize per_cpu_pageset for cpu 0.
+         * A cpuup callback will do this for every cpu
+         * as it comes online
+         */
+        err = process_zones(smp_processor_id());
+        BUG_ON(err);
+        register_cpu_notifier(&pageset_notifier);
+}
+#endif
 /*
 * Set up the zone data structures:
 *   - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
-                /*
+                batch = zone_batchsize(zone);
-                 * The per-cpu-pages pools are set to around 1000th of the
-                 * size of the zone.  But no more than 1/4 of a meg - there's
-                 * no point in going beyond the size of L2 cache.
-                 *
-                 * OK, so we don't know how big the cache is.  So guess.
-                 */
-                batch = zone->present_pages / 1024;
-                if (batch * PAGE_SIZE > 256 * 1024)
-                        batch = (256 * 1024) / PAGE_SIZE;
-                batch /= 4;             /* We effectively *= 4 below */
-                if (batch < 1)
-                        batch = 1;
-                /*
-                 * Clamp the batch to a 2^n - 1 value. Having a power
-                 * of 2 value was found to be more likely to have
-                 * suboptimal cache aliasing properties in some cases.
-                 *
-                 * For example if 2 tasks are alternately allocating
-                 * batches of pages, one task can end up with a lot
-                 * of pages of one half of the possible page colors
-                 * and the other with pages of the other colors.
-                 */
-                batch = (1 << fls(batch + batch/2)) - 1;
                for (cpu = 0; cpu < NR_CPUS; cpu++) {
                        struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+                        struct per_cpu_pageset *pgset;
+                        pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+                                        (j * NR_CPUS) + cpu];
+                        zone->pageset[cpu] = pgset;
+#else
+                        struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
-                        pcp = &zone->pageset[cpu].pcp[0];       /* hot */
+                        pcp = &pgset->pcp[0];                   /* hot */
                        pcp->count = 0;
                        pcp->low = 2 * batch;
                        pcp->high = 6 * batch;
                        pcp->batch = 1 * batch;
                        INIT_LIST_HEAD(&pcp->list);
-                        pcp = &zone->pageset[cpu].pcp[1];       /* cold */
+                        pcp = &pgset->pcp[1];                   /* cold */
                        pcp->count = 0;
                        pcp->low = 0;
                        pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
                        struct per_cpu_pageset *pageset;
                        int j;
-                        pageset = &zone->pageset[i];
+                        pageset = zone_pcp(zone, i);
                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
                                if (pageset->pcp[j].count)
                                        break;
author	Christoph Lameter <christoph@lameter.com>	2005-06-21 20:14:47 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-21 21:46:16 -0400
commit	e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a (patch)
tree	f04f7b0d08cbc46d2f190a85904a3dd696dc6e88 /mm
parent	63551ae0feaaa23807ebea60de1901564bbef32e (diff)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
721	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);	721	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722	page = __alloc_pages(gfp, order, zl);	722	page = __alloc_pages(gfp, order, zl);
723	if (page && page_zone(page) == zl->zones[0]) {	723	if (page && page_zone(page) == zl->zones[0]) {
724	zl->zones[0]->pageset[get_cpu()].interleave_hit++;	724	zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
725	put_cpu();	725	put_cpu();
726	}	726	}
727	return page;	727	return page;


diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
71	struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];	71	struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
72	EXPORT_SYMBOL(zone_table);	72	EXPORT_SYMBOL(zone_table);
73		73
		74	#ifdef CONFIG_NUMA
		75	static struct per_cpu_pageset
		76	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
		77	#endif
		78
74	static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };	79	static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
75	int min_free_kbytes = 1024;	80	int min_free_kbytes = 1024;
76		81
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
520	for_each_zone(zone) {	525	for_each_zone(zone) {
521	struct per_cpu_pageset *pset;	526	struct per_cpu_pageset *pset;
522		527
523	pset = &zone->pageset[cpu];	528	pset = zone_pcp(zone, cpu);
524	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {	529	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
525	struct per_cpu_pages *pcp;	530	struct per_cpu_pages *pcp;
526		531
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
583		588
584	local_irq_save(flags);	589	local_irq_save(flags);
585	cpu = smp_processor_id();	590	cpu = smp_processor_id();
586	p = &z->pageset[cpu];	591	p = zone_pcp(z,cpu);
587	if (pg == orig) {	592	if (pg == orig) {
588	z->pageset[cpu].numa_hit++;	593	p->numa_hit++;
589	} else {	594	} else {
590	p->numa_miss++;	595	p->numa_miss++;
591	zonelist->zones[0]->pageset[cpu].numa_foreign++;	596	zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
592	}	597	}
593	if (pg == NODE_DATA(numa_node_id()))	598	if (pg == NODE_DATA(numa_node_id()))
594	p->local_node++;	599	p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
615	if (PageAnon(page))	620	if (PageAnon(page))
616	page->mapping = NULL;	621	page->mapping = NULL;
617	free_pages_check(__FUNCTION__, page);	622	free_pages_check(__FUNCTION__, page);
618	pcp = &zone->pageset[get_cpu()].pcp[cold];	623	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
619	local_irq_save(flags);	624	local_irq_save(flags);
620	if (pcp->count >= pcp->high)	625	if (pcp->count >= pcp->high)
621	pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);	626	pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
659	if (order == 0) {	664	if (order == 0) {
660	struct per_cpu_pages *pcp;	665	struct per_cpu_pages *pcp;
661		666
662	pcp = &zone->pageset[get_cpu()].pcp[cold];	667	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
663	local_irq_save(flags);	668	local_irq_save(flags);
664	if (pcp->count <= pcp->low)	669	if (pcp->count <= pcp->low)
665	pcp->count += rmqueue_bulk(zone, 0,	670	pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
1262	if (!cpu_possible(cpu))	1267	if (!cpu_possible(cpu))
1263	continue;	1268	continue;
1264		1269
1265	pageset = zone->pageset + cpu;	1270	pageset = zone_pcp(zone, cpu);
1266		1271
1267	for (temperature = 0; temperature < 2; temperature++)	1272	for (temperature = 0; temperature < 2; temperature++)
1268	printk("cpu %d %s: low %d, high %d, batch %d\n",	1273	printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1645	memmap_init_zone((size), (nid), (zone), (start_pfn))	1650	memmap_init_zone((size), (nid), (zone), (start_pfn))
1646	#endif	1651	#endif
1647		1652
		1653	static int __devinit zone_batchsize(struct zone *zone)
		1654	{
		1655	int batch;
		1656
		1657	/*
		1658	* The per-cpu-pages pools are set to around 1000th of the
		1659	* size of the zone. But no more than 1/4 of a meg - there's
		1660	* no point in going beyond the size of L2 cache.
		1661	*
		1662	* OK, so we don't know how big the cache is. So guess.
		1663	*/
		1664	batch = zone->present_pages / 1024;
		1665	if (batch * PAGE_SIZE > 256 * 1024)
		1666	batch = (256 * 1024) / PAGE_SIZE;
		1667	batch /= 4; /* We effectively *= 4 below */
		1668	if (batch < 1)
		1669	batch = 1;
		1670
		1671	/*
		1672	* Clamp the batch to a 2^n - 1 value. Having a power
		1673	* of 2 value was found to be more likely to have
		1674	* suboptimal cache aliasing properties in some cases.
		1675	*
		1676	* For example if 2 tasks are alternately allocating
		1677	* batches of pages, one task can end up with a lot
		1678	* of pages of one half of the possible page colors
		1679	* and the other with pages of the other colors.
		1680	*/
		1681	batch = (1 << fls(batch + batch/2)) - 1;
		1682	return batch;
		1683	}
		1684
		1685	#ifdef CONFIG_NUMA
		1686	/*
		1687	* Dynamicaly allocate memory for the
		1688	* per cpu pageset array in struct zone.
		1689	*/
		1690	static int __devinit process_zones(int cpu)
		1691	{
		1692	struct zone *zone, *dzone;
		1693	int i;
		1694
		1695	for_each_zone(zone) {
		1696	struct per_cpu_pageset *npageset = NULL;
		1697
		1698	npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
		1699	GFP_KERNEL, cpu_to_node(cpu));
		1700	if (!npageset) {
		1701	zone->pageset[cpu] = NULL;
		1702	goto bad;
		1703	}
		1704
		1705	if (zone->pageset[cpu]) {
		1706	memcpy(npageset, zone->pageset[cpu],
		1707	sizeof(struct per_cpu_pageset));
		1708
		1709	/* Relocate lists */
		1710	for (i = 0; i < 2; i++) {
		1711	INIT_LIST_HEAD(&npageset->pcp[i].list);
		1712	list_splice(&zone->pageset[cpu]->pcp[i].list,
		1713	&npageset->pcp[i].list);
		1714	}
		1715	} else {
		1716	struct per_cpu_pages *pcp;
		1717	unsigned long batch;
		1718
		1719	batch = zone_batchsize(zone);
		1720
		1721	pcp = &npageset->pcp[0]; /* hot */
		1722	pcp->count = 0;
		1723	pcp->low = 2 * batch;
		1724	pcp->high = 6 * batch;
		1725	pcp->batch = 1 * batch;
		1726	INIT_LIST_HEAD(&pcp->list);
		1727
		1728	pcp = &npageset->pcp[1]; /* cold*/
		1729	pcp->count = 0;
		1730	pcp->low = 0;
		1731	pcp->high = 2 * batch;
		1732	pcp->batch = 1 * batch;
		1733	INIT_LIST_HEAD(&pcp->list);
		1734	}
		1735	zone->pageset[cpu] = npageset;
		1736	}
		1737
		1738	return 0;
		1739	bad:
		1740	for_each_zone(dzone) {
		1741	if (dzone == zone)
		1742	break;
		1743	kfree(dzone->pageset[cpu]);
		1744	dzone->pageset[cpu] = NULL;
		1745	}
		1746	return -ENOMEM;
		1747	}
		1748
		1749	static inline void free_zone_pagesets(int cpu)
		1750	{
		1751	#ifdef CONFIG_NUMA
		1752	struct zone *zone;
		1753
		1754	for_each_zone(zone) {
		1755	struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
		1756
		1757	zone_pcp(zone, cpu) = NULL;
		1758	kfree(pset);
		1759	}
		1760	#endif
		1761	}
		1762
		1763	static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
		1764	unsigned long action,
		1765	void *hcpu)
		1766	{
		1767	int cpu = (long)hcpu;
		1768	int ret = NOTIFY_OK;
		1769
		1770	switch (action) {
		1771	case CPU_UP_PREPARE:
		1772	if (process_zones(cpu))
		1773	ret = NOTIFY_BAD;
		1774	break;
		1775	#ifdef CONFIG_HOTPLUG_CPU
		1776	case CPU_DEAD:
		1777	free_zone_pagesets(cpu);
		1778	break;
		1779	#endif
		1780	default:
		1781	break;
		1782	}
		1783	return ret;
		1784	}
		1785
		1786	static struct notifier_block pageset_notifier =
		1787	{ &pageset_cpuup_callback, NULL, 0 };
		1788
		1789	void __init setup_per_cpu_pageset()
		1790	{
		1791	int err;
		1792
		1793	/* Initialize per_cpu_pageset for cpu 0.
		1794	* A cpuup callback will do this for every cpu
		1795	* as it comes online
		1796	*/
		1797	err = process_zones(smp_processor_id());
		1798	BUG_ON(err);
		1799	register_cpu_notifier(&pageset_notifier);
		1800	}
		1801
		1802	#endif
		1803
1648	/*	1804	/*
1649	* Set up the zone data structures:	1805	* Set up the zone data structures:
1650	* - mark all pages reserved	1806	* - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1687		1843
1688	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;	1844	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1689		1845
1690	/*	1846	batch = zone_batchsize(zone);
1691	* The per-cpu-pages pools are set to around 1000th of the
1692	* size of the zone. But no more than 1/4 of a meg - there's
1693	* no point in going beyond the size of L2 cache.
1694	*
1695	* OK, so we don't know how big the cache is. So guess.
1696	*/
1697	batch = zone->present_pages / 1024;
1698	if (batch * PAGE_SIZE > 256 * 1024)
1699	batch = (256 * 1024) / PAGE_SIZE;
1700	batch /= 4; /* We effectively *= 4 below */
1701	if (batch < 1)
1702	batch = 1;
1703
1704	/*
1705	* Clamp the batch to a 2^n - 1 value. Having a power
1706	* of 2 value was found to be more likely to have
1707	* suboptimal cache aliasing properties in some cases.
1708	*
1709	* For example if 2 tasks are alternately allocating
1710	* batches of pages, one task can end up with a lot
1711	* of pages of one half of the possible page colors
1712	* and the other with pages of the other colors.
1713	*/
1714	batch = (1 << fls(batch + batch/2)) - 1;
1715		1847
1716	for (cpu = 0; cpu < NR_CPUS; cpu++) {	1848	for (cpu = 0; cpu < NR_CPUS; cpu++) {
1717	struct per_cpu_pages *pcp;	1849	struct per_cpu_pages *pcp;
		1850	#ifdef CONFIG_NUMA
		1851	struct per_cpu_pageset *pgset;
		1852	pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
		1853	(j * NR_CPUS) + cpu];
		1854
		1855	zone->pageset[cpu] = pgset;
		1856	#else
		1857	struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
		1858	#endif
1718		1859
1719	pcp = &zone->pageset[cpu].pcp[0]; /* hot */	1860	pcp = &pgset->pcp[0]; /* hot */
1720	pcp->count = 0;	1861	pcp->count = 0;
1721	pcp->low = 2 * batch;	1862	pcp->low = 2 * batch;
1722	pcp->high = 6 * batch;	1863	pcp->high = 6 * batch;
1723	pcp->batch = 1 * batch;	1864	pcp->batch = 1 * batch;
1724	INIT_LIST_HEAD(&pcp->list);	1865	INIT_LIST_HEAD(&pcp->list);
1725		1866
1726	pcp = &zone->pageset[cpu].pcp[1]; /* cold */	1867	pcp = &pgset->pcp[1]; /* cold */
1727	pcp->count = 0;	1868	pcp->count = 0;
1728	pcp->low = 0;	1869	pcp->low = 0;
1729	pcp->high = 2 * batch;	1870	pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
1929	struct per_cpu_pageset *pageset;	2070	struct per_cpu_pageset *pageset;
1930	int j;	2071	int j;
1931		2072
1932	pageset = &zone->pageset[i];	2073	pageset = zone_pcp(zone, i);
1933	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {	2074	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
1934	if (pageset->pcp[j].count)	2075	if (pageset->pcp[j].count)
1935	break;	2076	break;