aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/base/node.c2
-rw-r--r--include/linux/mm.h6
-rw-r--r--include/linux/mmzone.h11
-rw-r--r--init/main.c1
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/page_alloc.c211
6 files changed, 195 insertions, 38 deletions
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d4517ccc422..904b27caf697 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
87 for (i = 0; i < MAX_NR_ZONES; i++) { 87 for (i = 0; i < MAX_NR_ZONES; i++) {
88 struct zone *z = &pg->node_zones[i]; 88 struct zone *z = &pg->node_zones[i];
89 for (cpu = 0; cpu < NR_CPUS; cpu++) { 89 for (cpu = 0; cpu < NR_CPUS; cpu++) {
90 struct per_cpu_pageset *ps = &z->pageset[cpu]; 90 struct per_cpu_pageset *ps = zone_pcp(z,cpu);
91 numa_hit += ps->numa_hit; 91 numa_hit += ps->numa_hit;
92 numa_miss += ps->numa_miss; 92 numa_miss += ps->numa_miss;
93 numa_foreign += ps->numa_foreign; 93 numa_foreign += ps->numa_foreign;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17518fe0b311..1813b162b0a8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
691extern void si_meminfo(struct sysinfo * val); 691extern void si_meminfo(struct sysinfo * val);
692extern void si_meminfo_node(struct sysinfo *val, int nid); 692extern void si_meminfo_node(struct sysinfo *val, int nid);
693 693
694#ifdef CONFIG_NUMA
695extern void setup_per_cpu_pageset(void);
696#else
697static inline void setup_per_cpu_pageset(void) {}
698#endif
699
694/* prio_tree.c */ 700/* prio_tree.c */
695void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); 701void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
696void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); 702void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18fed8b67943..4733d35d8223 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
63#endif 63#endif
64} ____cacheline_aligned_in_smp; 64} ____cacheline_aligned_in_smp;
65 65
66#ifdef CONFIG_NUMA
67#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
68#else
69#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
70#endif
71
66#define ZONE_DMA 0 72#define ZONE_DMA 0
67#define ZONE_NORMAL 1 73#define ZONE_NORMAL 1
68#define ZONE_HIGHMEM 2 74#define ZONE_HIGHMEM 2
@@ -122,8 +128,11 @@ struct zone {
122 */ 128 */
123 unsigned long lowmem_reserve[MAX_NR_ZONES]; 129 unsigned long lowmem_reserve[MAX_NR_ZONES];
124 130
131#ifdef CONFIG_NUMA
132 struct per_cpu_pageset *pageset[NR_CPUS];
133#else
125 struct per_cpu_pageset pageset[NR_CPUS]; 134 struct per_cpu_pageset pageset[NR_CPUS];
126 135#endif
127 /* 136 /*
128 * free areas of different sizes 137 * free areas of different sizes
129 */ 138 */
diff --git a/init/main.c b/init/main.c
index 40bf367ffdf1..d324801729ba 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
490 vfs_caches_init_early(); 490 vfs_caches_init_early();
491 mem_init(); 491 mem_init();
492 kmem_cache_init(); 492 kmem_cache_init();
493 setup_per_cpu_pageset();
493 numa_policy_init(); 494 numa_policy_init();
494 if (late_time_init) 495 if (late_time_init)
495 late_time_init(); 496 late_time_init();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); 721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl); 722 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) { 723 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++; 724 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
725 put_cpu(); 725 put_cpu();
726 } 726 }
727 return page; 727 return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; 71struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
72EXPORT_SYMBOL(zone_table); 72EXPORT_SYMBOL(zone_table);
73 73
74#ifdef CONFIG_NUMA
75static struct per_cpu_pageset
76 pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
77#endif
78
74static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 79static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
75int min_free_kbytes = 1024; 80int min_free_kbytes = 1024;
76 81
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
520 for_each_zone(zone) { 525 for_each_zone(zone) {
521 struct per_cpu_pageset *pset; 526 struct per_cpu_pageset *pset;
522 527
523 pset = &zone->pageset[cpu]; 528 pset = zone_pcp(zone, cpu);
524 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 529 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
525 struct per_cpu_pages *pcp; 530 struct per_cpu_pages *pcp;
526 531
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
583 588
584 local_irq_save(flags); 589 local_irq_save(flags);
585 cpu = smp_processor_id(); 590 cpu = smp_processor_id();
586 p = &z->pageset[cpu]; 591 p = zone_pcp(z,cpu);
587 if (pg == orig) { 592 if (pg == orig) {
588 z->pageset[cpu].numa_hit++; 593 p->numa_hit++;
589 } else { 594 } else {
590 p->numa_miss++; 595 p->numa_miss++;
591 zonelist->zones[0]->pageset[cpu].numa_foreign++; 596 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
592 } 597 }
593 if (pg == NODE_DATA(numa_node_id())) 598 if (pg == NODE_DATA(numa_node_id()))
594 p->local_node++; 599 p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
615 if (PageAnon(page)) 620 if (PageAnon(page))
616 page->mapping = NULL; 621 page->mapping = NULL;
617 free_pages_check(__FUNCTION__, page); 622 free_pages_check(__FUNCTION__, page);
618 pcp = &zone->pageset[get_cpu()].pcp[cold]; 623 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
619 local_irq_save(flags); 624 local_irq_save(flags);
620 if (pcp->count >= pcp->high) 625 if (pcp->count >= pcp->high)
621 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 626 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
659 if (order == 0) { 664 if (order == 0) {
660 struct per_cpu_pages *pcp; 665 struct per_cpu_pages *pcp;
661 666
662 pcp = &zone->pageset[get_cpu()].pcp[cold]; 667 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
663 local_irq_save(flags); 668 local_irq_save(flags);
664 if (pcp->count <= pcp->low) 669 if (pcp->count <= pcp->low)
665 pcp->count += rmqueue_bulk(zone, 0, 670 pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
1262 if (!cpu_possible(cpu)) 1267 if (!cpu_possible(cpu))
1263 continue; 1268 continue;
1264 1269
1265 pageset = zone->pageset + cpu; 1270 pageset = zone_pcp(zone, cpu);
1266 1271
1267 for (temperature = 0; temperature < 2; temperature++) 1272 for (temperature = 0; temperature < 2; temperature++)
1268 printk("cpu %d %s: low %d, high %d, batch %d\n", 1273 printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1645 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1650 memmap_init_zone((size), (nid), (zone), (start_pfn))
1646#endif 1651#endif
1647 1652
1653static int __devinit zone_batchsize(struct zone *zone)
1654{
1655 int batch;
1656
1657 /*
1658 * The per-cpu-pages pools are set to around 1000th of the
1659 * size of the zone. But no more than 1/4 of a meg - there's
1660 * no point in going beyond the size of L2 cache.
1661 *
1662 * OK, so we don't know how big the cache is. So guess.
1663 */
1664 batch = zone->present_pages / 1024;
1665 if (batch * PAGE_SIZE > 256 * 1024)
1666 batch = (256 * 1024) / PAGE_SIZE;
1667 batch /= 4; /* We effectively *= 4 below */
1668 if (batch < 1)
1669 batch = 1;
1670
1671 /*
1672 * Clamp the batch to a 2^n - 1 value. Having a power
1673 * of 2 value was found to be more likely to have
1674 * suboptimal cache aliasing properties in some cases.
1675 *
1676 * For example if 2 tasks are alternately allocating
1677 * batches of pages, one task can end up with a lot
1678 * of pages of one half of the possible page colors
1679 * and the other with pages of the other colors.
1680 */
1681 batch = (1 << fls(batch + batch/2)) - 1;
1682 return batch;
1683}
1684
1685#ifdef CONFIG_NUMA
1686/*
 1687 * Dynamically allocate memory for the
1688 * per cpu pageset array in struct zone.
1689 */
1690static int __devinit process_zones(int cpu)
1691{
1692 struct zone *zone, *dzone;
1693 int i;
1694
1695 for_each_zone(zone) {
1696 struct per_cpu_pageset *npageset = NULL;
1697
1698 npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
1699 GFP_KERNEL, cpu_to_node(cpu));
1700 if (!npageset) {
1701 zone->pageset[cpu] = NULL;
1702 goto bad;
1703 }
1704
1705 if (zone->pageset[cpu]) {
1706 memcpy(npageset, zone->pageset[cpu],
1707 sizeof(struct per_cpu_pageset));
1708
1709 /* Relocate lists */
1710 for (i = 0; i < 2; i++) {
1711 INIT_LIST_HEAD(&npageset->pcp[i].list);
1712 list_splice(&zone->pageset[cpu]->pcp[i].list,
1713 &npageset->pcp[i].list);
1714 }
1715 } else {
1716 struct per_cpu_pages *pcp;
1717 unsigned long batch;
1718
1719 batch = zone_batchsize(zone);
1720
1721 pcp = &npageset->pcp[0]; /* hot */
1722 pcp->count = 0;
1723 pcp->low = 2 * batch;
1724 pcp->high = 6 * batch;
1725 pcp->batch = 1 * batch;
1726 INIT_LIST_HEAD(&pcp->list);
1727
1728 pcp = &npageset->pcp[1]; /* cold*/
1729 pcp->count = 0;
1730 pcp->low = 0;
1731 pcp->high = 2 * batch;
1732 pcp->batch = 1 * batch;
1733 INIT_LIST_HEAD(&pcp->list);
1734 }
1735 zone->pageset[cpu] = npageset;
1736 }
1737
1738 return 0;
1739bad:
1740 for_each_zone(dzone) {
1741 if (dzone == zone)
1742 break;
1743 kfree(dzone->pageset[cpu]);
1744 dzone->pageset[cpu] = NULL;
1745 }
1746 return -ENOMEM;
1747}
1748
1749static inline void free_zone_pagesets(int cpu)
1750{
1751#ifdef CONFIG_NUMA
1752 struct zone *zone;
1753
1754 for_each_zone(zone) {
1755 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1756
1757 zone_pcp(zone, cpu) = NULL;
1758 kfree(pset);
1759 }
1760#endif
1761}
1762
1763static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1764 unsigned long action,
1765 void *hcpu)
1766{
1767 int cpu = (long)hcpu;
1768 int ret = NOTIFY_OK;
1769
1770 switch (action) {
1771 case CPU_UP_PREPARE:
1772 if (process_zones(cpu))
1773 ret = NOTIFY_BAD;
1774 break;
1775#ifdef CONFIG_HOTPLUG_CPU
1776 case CPU_DEAD:
1777 free_zone_pagesets(cpu);
1778 break;
1779#endif
1780 default:
1781 break;
1782 }
1783 return ret;
1784}
1785
1786static struct notifier_block pageset_notifier =
1787 { &pageset_cpuup_callback, NULL, 0 };
1788
1789void __init setup_per_cpu_pageset()
1790{
1791 int err;
1792
1793 /* Initialize per_cpu_pageset for cpu 0.
1794 * A cpuup callback will do this for every cpu
1795 * as it comes online
1796 */
1797 err = process_zones(smp_processor_id());
1798 BUG_ON(err);
1799 register_cpu_notifier(&pageset_notifier);
1800}
1801
1802#endif
1803
1648/* 1804/*
1649 * Set up the zone data structures: 1805 * Set up the zone data structures:
1650 * - mark all pages reserved 1806 * - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1687 1843
1688 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1844 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1689 1845
1690 /* 1846 batch = zone_batchsize(zone);
1691 * The per-cpu-pages pools are set to around 1000th of the
1692 * size of the zone. But no more than 1/4 of a meg - there's
1693 * no point in going beyond the size of L2 cache.
1694 *
1695 * OK, so we don't know how big the cache is. So guess.
1696 */
1697 batch = zone->present_pages / 1024;
1698 if (batch * PAGE_SIZE > 256 * 1024)
1699 batch = (256 * 1024) / PAGE_SIZE;
1700 batch /= 4; /* We effectively *= 4 below */
1701 if (batch < 1)
1702 batch = 1;
1703
1704 /*
1705 * Clamp the batch to a 2^n - 1 value. Having a power
1706 * of 2 value was found to be more likely to have
1707 * suboptimal cache aliasing properties in some cases.
1708 *
1709 * For example if 2 tasks are alternately allocating
1710 * batches of pages, one task can end up with a lot
1711 * of pages of one half of the possible page colors
1712 * and the other with pages of the other colors.
1713 */
1714 batch = (1 << fls(batch + batch/2)) - 1;
1715 1847
1716 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1848 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1717 struct per_cpu_pages *pcp; 1849 struct per_cpu_pages *pcp;
1850#ifdef CONFIG_NUMA
1851 struct per_cpu_pageset *pgset;
1852 pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
1853 (j * NR_CPUS) + cpu];
1854
1855 zone->pageset[cpu] = pgset;
1856#else
1857 struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
1858#endif
1718 1859
1719 pcp = &zone->pageset[cpu].pcp[0]; /* hot */ 1860 pcp = &pgset->pcp[0]; /* hot */
1720 pcp->count = 0; 1861 pcp->count = 0;
1721 pcp->low = 2 * batch; 1862 pcp->low = 2 * batch;
1722 pcp->high = 6 * batch; 1863 pcp->high = 6 * batch;
1723 pcp->batch = 1 * batch; 1864 pcp->batch = 1 * batch;
1724 INIT_LIST_HEAD(&pcp->list); 1865 INIT_LIST_HEAD(&pcp->list);
1725 1866
1726 pcp = &zone->pageset[cpu].pcp[1]; /* cold */ 1867 pcp = &pgset->pcp[1]; /* cold */
1727 pcp->count = 0; 1868 pcp->count = 0;
1728 pcp->low = 0; 1869 pcp->low = 0;
1729 pcp->high = 2 * batch; 1870 pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
1929 struct per_cpu_pageset *pageset; 2070 struct per_cpu_pageset *pageset;
1930 int j; 2071 int j;
1931 2072
1932 pageset = &zone->pageset[i]; 2073 pageset = zone_pcp(zone, i);
1933 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 2074 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
1934 if (pageset->pcp[j].count) 2075 if (pageset->pcp[j].count)
1935 break; 2076 break;