Diffstat (limited to 'mm/page_alloc.c')
 -rw-r--r--  mm/page_alloc.c | 211
 1 file changed, 176 insertions(+), 35 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
 
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+        pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
+
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
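The new pageset_table is a flat boot-time array, indexed node-major, then
zone, then CPU; the free_area_init_core() hunk later in this patch computes
the slot as nid*MAX_NR_ZONES*NR_CPUS + (j * NR_CPUS) + cpu. A minimal
user-space sketch of that addressing follows; the MAX_NUMNODES and NR_CPUS
values are illustrative assumptions, not taken from the patch.

/* Illustrative only: mirrors the index math in the free_area_init_core()
 * hunk further down. The node/CPU counts are example config choices. */
#include <stdio.h>

#define MAX_NR_ZONES 3          /* DMA, Normal, HighMem in this kernel */
#define MAX_NUMNODES 4          /* assumption for the example */
#define NR_CPUS      8          /* assumption for the example */

static int pageset_index(int nid, int zone_idx, int cpu)
{
        return nid * MAX_NR_ZONES * NR_CPUS + zone_idx * NR_CPUS + cpu;
}

int main(void)
{
        /* e.g. node 1, Normal zone (index 1), CPU 3 */
        printf("%d\n", pageset_index(1, 1, 3));        /* prints 35 */
        return 0;
}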
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
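zone_pcp() itself is not visible in this file; it belongs to the mmzone.h
half of the patch. Its shape can be inferred from the usage here: with NUMA
it must yield a stored pointer (process_zones() below assigns
zone->pageset[cpu], and free_zone_pagesets() uses zone_pcp(zone, cpu) as an
lvalue), while without NUMA the pagesets stay embedded in struct zone and
the accessor must take an address. A sketch of that inferred pair:

/* Inferred accessor pair; the authoritative definition lives in
 * include/linux/mmzone.h, which this diff does not show. */
#ifdef CONFIG_NUMA
/* struct zone { ... struct per_cpu_pageset *pageset[NR_CPUS]; ... }; */
#define zone_pcp(__z, __cpu)    ((__z)->pageset[(__cpu)])
#else
/* struct zone { ... struct per_cpu_pageset pageset[NR_CPUS]; ... }; */
#define zone_pcp(__z, __cpu)    (&(__z)->pageset[(__cpu)])
#endif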
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z, cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
 		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
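free_hot_cold_page() and buffered_rmqueue() are the two sides of one
per-cpu cache: each CPU keeps a hot list (pcp[0]) and a cold list (pcp[1])
per zone, drained in batch-sized chunks once count reaches high and
refilled once it falls to low. Reconstructed from the fields this patch
touches (the authoritative definitions live in include/linux/mmzone.h),
the structures look roughly like:

/* Rough reconstruction from the fields used in this patch. */
struct per_cpu_pages {
        int count;              /* pages currently on the list */
        int low;                /* refill via rmqueue_bulk() at or below this */
        int high;               /* drain via free_pages_bulk() at or above this */
        int batch;              /* chunk size for the bulk operations */
        struct list_head list;  /* the cached pages themselves */
};

struct per_cpu_pageset {
        struct per_cpu_pages pcp[2];    /* [0]: hot, [1]: cold */
#ifdef CONFIG_NUMA
        unsigned long numa_hit, numa_miss, numa_foreign, local_node;
        /* ...plus the other NUMA counters updated in zone_statistics() */
#endif
};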
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
 		if (!cpu_possible(cpu))
 			continue;
 
-		pageset = zone->pageset + cpu;
+		pageset = zone_pcp(zone, cpu);
 
 		for (temperature = 0; temperature < 2; temperature++)
 			printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+        int batch;
+
+        /*
+         * The per-cpu-pages pools are set to around 1000th of the
+         * size of the zone.  But no more than 1/4 of a meg - there's
+         * no point in going beyond the size of L2 cache.
+         *
+         * OK, so we don't know how big the cache is.  So guess.
+         */
+        batch = zone->present_pages / 1024;
+        if (batch * PAGE_SIZE > 256 * 1024)
+                batch = (256 * 1024) / PAGE_SIZE;
+        batch /= 4;             /* We effectively *= 4 below */
+        if (batch < 1)
+                batch = 1;
+
+        /*
+         * Clamp the batch to a 2^n - 1 value. Having a power
+         * of 2 value was found to be more likely to have
+         * suboptimal cache aliasing properties in some cases.
+         *
+         * For example if 2 tasks are alternately allocating
+         * batches of pages, one task can end up with a lot
+         * of pages of one half of the possible page colors
+         * and the other with pages of the other colors.
+         */
+        batch = (1 << fls(batch + batch/2)) - 1;
+        return batch;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Dynamically allocate memory for the
+ * per-cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+        struct zone *zone, *dzone;
+        int i;
+
+        for_each_zone(zone) {
+                struct per_cpu_pageset *npageset = NULL;
+
+                npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+                                        GFP_KERNEL, cpu_to_node(cpu));
+                if (!npageset) {
+                        zone->pageset[cpu] = NULL;
+                        goto bad;
+                }
+
+                if (zone->pageset[cpu]) {
+                        memcpy(npageset, zone->pageset[cpu],
+                                sizeof(struct per_cpu_pageset));
+
+                        /* Relocate lists */
+                        for (i = 0; i < 2; i++) {
+                                INIT_LIST_HEAD(&npageset->pcp[i].list);
+                                list_splice(&zone->pageset[cpu]->pcp[i].list,
+                                        &npageset->pcp[i].list);
+                        }
+                } else {
+                        struct per_cpu_pages *pcp;
+                        unsigned long batch;
+
+                        batch = zone_batchsize(zone);
+
+                        pcp = &npageset->pcp[0];        /* hot */
+                        pcp->count = 0;
+                        pcp->low = 2 * batch;
+                        pcp->high = 6 * batch;
+                        pcp->batch = 1 * batch;
+                        INIT_LIST_HEAD(&pcp->list);
+
+                        pcp = &npageset->pcp[1];        /* cold */
+                        pcp->count = 0;
+                        pcp->low = 0;
+                        pcp->high = 2 * batch;
+                        pcp->batch = 1 * batch;
+                        INIT_LIST_HEAD(&pcp->list);
+                }
+                zone->pageset[cpu] = npageset;
+        }
+
+        return 0;
+bad:
+        for_each_zone(dzone) {
+                if (dzone == zone)
+                        break;
+                kfree(dzone->pageset[cpu]);
+                dzone->pageset[cpu] = NULL;
+        }
+        return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+        struct zone *zone;
+
+        for_each_zone(zone) {
+                struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+                zone_pcp(zone, cpu) = NULL;
+                kfree(pset);
+        }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+                unsigned long action,
+                void *hcpu)
+{
+        int cpu = (long)hcpu;
+        int ret = NOTIFY_OK;
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+                if (process_zones(cpu))
+                        ret = NOTIFY_BAD;
+                break;
+#ifdef CONFIG_HOTPLUG_CPU
+        case CPU_DEAD:
+                free_zone_pagesets(cpu);
+                break;
+#endif
+        default:
+                break;
+        }
+        return ret;
+}
+
+static struct notifier_block pageset_notifier =
+        { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset(void)
+{
+        int err;
+
+        /* Initialize per_cpu_pageset for cpu 0.
+         * A cpuup callback will do this for every cpu
+         * as it comes online.
+         */
+        err = process_zones(smp_processor_id());
+        BUG_ON(err);
+        register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
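To make the zone_batchsize() heuristic concrete, here is its arithmetic for
a hypothetical 1 GiB zone with 4 KiB pages, as a standalone user-space
sketch (fls_sketch() stands in for the kernel's fls()):

/* Worked example of zone_batchsize() for an assumed 1 GiB zone with
 * 4 KiB pages, i.e. 262144 present pages. */
#include <stdio.h>

static int fls_sketch(long x)   /* position of highest set bit, 1-based */
{
        int r = 0;
        while (x) { r++; x >>= 1; }
        return r;
}

int main(void)
{
        const long PAGE_SIZE = 4096;
        long present_pages = 262144;            /* assumption: 1 GiB zone */

        long batch = present_pages / 1024;      /* 256 */
        if (batch * PAGE_SIZE > 256 * 1024)
                batch = (256 * 1024) / PAGE_SIZE;       /* cap: 64 */
        batch /= 4;                             /* 16 */
        if (batch < 1)
                batch = 1;
        batch = (1 << fls_sketch(batch + batch / 2)) - 1; /* fls(24)=5 -> 31 */

        /* hot list: low 62, high 186; cold list: low 0, high 62 */
        printf("batch=%ld low=%ld high=%ld\n", batch, 2 * batch, 6 * batch);
        return 0;
}

The 2^n - 1 clamp turns the capped value of 16 into 31, so each CPU's hot
list refills when it drops to 62 pages and drains once it reaches 186.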
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+			struct per_cpu_pageset *pgset;
+			pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+					(j * NR_CPUS) + cpu];
+
+			zone->pageset[cpu] = pgset;
+#else
+			struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
 
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp = &pgset->pcp[0];		/* hot */
 			pcp->count = 0;
 			pcp->low = 2 * batch;
 			pcp->high = 6 * batch;
 			pcp->batch = 1 * batch;
 			INIT_LIST_HEAD(&pcp->list);
 
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp = &pgset->pcp[1];		/* cold */
 			pcp->count = 0;
 			pcp->low = 0;
 			pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		struct per_cpu_pageset *pageset;
 		int j;
 
-		pageset = &zone->pageset[i];
+		pageset = zone_pcp(zone, i);
 		for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
 			if (pageset->pcp[j].count)
 				break;
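Overall boot flow after this patch: free_area_init_core() points each
zone's pagesets at slots of the static __initdata pageset_table (kmalloc()
is not available that early), and setup_per_cpu_pageset() later replaces
them with kmalloc_node() copies local to each CPU's node, the memcpy() plus
list_splice() in process_zones() carrying across any pages already queued
on the boot pagesets. A rough sizing sketch suggests why the boot table is
marked __initdata rather than kept permanently; every value below is an
assumption that depends on the .config, and the struct size is a guess:

/* Illustrative sizing of pageset_table; all values are assumptions. */
#include <stdio.h>

int main(void)
{
        long max_nr_zones = 3;          /* DMA, Normal, HighMem */
        long max_numnodes = 64;         /* assumption */
        long nr_cpus = 128;             /* assumption */
        long pageset_size = 128;        /* guess at sizeof(per_cpu_pageset) */

        long bytes = max_nr_zones * max_numnodes * nr_cpus * pageset_size;
        printf("%ld KiB\n", bytes / 1024);      /* prints 3072 KiB */
        return 0;
}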