author	Christoph Lameter <christoph@lameter.com>	2005-06-21 20:14:47 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-21 21:46:16 -0400
commit	e7c8d5c9955a4d2e88e36b640563f5d6d5aba48a (patch)
tree	f04f7b0d08cbc46d2f190a85904a3dd696dc6e88
parent	63551ae0feaaa23807ebea60de1901564bbef32e (diff)
[PATCH] node local per-cpu-pages
This patch modifies the way pagesets in struct zone are managed.

Each zone has a per-cpu array of pagesets.  So any particular CPU has some
memory in each zone structure which belongs to itself, even if that CPU is
not local to that zone.

So the patch relocates the pagesets for each CPU to the node that is nearest
to the CPU, instead of allocating the pagesets in the (possibly remote)
target zone.  This means that the operations to manage pages on a remote
zone can be done with information available locally.

We play a macro trick so that non-NUMA machines avoid the additional pointer
chase on the page allocator fastpath.

AIM7 benchmark on a 32 CPU SGI Altix

w/o patches:

Tasks   jobs/min  jti  jobs/min/task     real      cpu
    1     484.68  100      484.6769    12.01     1.97   Fri Mar 25 11:01:42 2005
  100   27140.46   89      271.4046    21.44   148.71   Fri Mar 25 11:02:04 2005
  200   30792.02   82      153.9601    37.80   296.72   Fri Mar 25 11:02:42 2005
  300   32209.27   81      107.3642    54.21   451.34   Fri Mar 25 11:03:37 2005
  400   34962.83   78       87.4071    66.59   588.97   Fri Mar 25 11:04:44 2005
  500   31676.92   75       63.3538    91.87   742.71   Fri Mar 25 11:06:16 2005
  600   36032.69   73       60.0545    96.91   885.44   Fri Mar 25 11:07:54 2005
  700   35540.43   77       50.7720   114.63  1024.28   Fri Mar 25 11:09:49 2005
  800   33906.70   74       42.3834   137.32  1181.65   Fri Mar 25 11:12:06 2005
  900   34120.67   73       37.9119   153.51  1325.26   Fri Mar 25 11:14:41 2005
 1000   34802.37   74       34.8024   167.23  1465.26   Fri Mar 25 11:17:28 2005

with slab API changes and pageset patch:

Tasks   jobs/min  jti  jobs/min/task     real      cpu
    1     485.00  100      485.0000    12.00     1.96   Fri Mar 25 11:46:18 2005
  100   28000.96   89      280.0096    20.79   150.45   Fri Mar 25 11:46:39 2005
  200   32285.80   79      161.4290    36.05   293.37   Fri Mar 25 11:47:16 2005
  300   40424.15   84      134.7472    43.19   438.42   Fri Mar 25 11:47:59 2005
  400   39155.01   79       97.8875    59.46   590.05   Fri Mar 25 11:48:59 2005
  500   37881.25   82       75.7625    76.82   730.19   Fri Mar 25 11:50:16 2005
  600   39083.14   78       65.1386    89.35   872.79   Fri Mar 25 11:51:46 2005
  700   38627.83   77       55.1826   105.47  1022.46   Fri Mar 25 11:53:32 2005
  800   39631.94   78       49.5399   117.48  1169.94   Fri Mar 25 11:55:30 2005
  900   36903.70   79       41.0041   141.94  1310.78   Fri Mar 25 11:57:53 2005
 1000   36201.23   77       36.2012   160.77  1458.31   Fri Mar 25 12:00:34 2005

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Shai Fultheim <Shai@Scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
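As a quick illustration of the macro trick mentioned above, here is a minimal
userspace sketch of the two access paths that zone_pcp() hides.  The two
zone_pcp() definitions are the ones this patch adds to include/linux/mmzone.h;
struct zone, struct per_cpu_pageset and NR_CPUS are reduced to illustrative
stand-ins, and calloc() stands in for the kmalloc_node() call the patch uses
to place each pageset on the CPU's local node.

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4                        /* illustrative value, not the kernel's */

struct per_cpu_pageset { int count; };   /* stand-in for the real structure */

#ifdef CONFIG_NUMA
/* NUMA case: struct zone keeps only pointers; each pageset is allocated on
 * the CPU's local node (kmalloc_node in the patch), at the cost of one extra
 * pointer chase inside zone_pcp(). */
struct zone { struct per_cpu_pageset *pageset[NR_CPUS]; };
#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
#else
/* non-NUMA case: pagesets stay embedded in struct zone, so zone_pcp()
 * degenerates to taking the address of an array element - no extra
 * indirection on the allocator fastpath. */
struct zone { struct per_cpu_pageset pageset[NR_CPUS]; };
#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
#endif

int main(void)
{
	static struct zone z;
#ifdef CONFIG_NUMA
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		z.pageset[cpu] = calloc(1, sizeof(struct per_cpu_pageset));
#endif
	/* Callers look identical either way: */
	zone_pcp(&z, 0)->count++;
	printf("cpu 0 pageset count = %d\n", zone_pcp(&z, 0)->count);
	return 0;
}

Compiled with -DCONFIG_NUMA the accessor pays one extra pointer chase but the
pageset memory can be node-local; without it, non-NUMA builds keep the old
zero-indirection fast path.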
-rw-r--r--	drivers/base/node.c	2
-rw-r--r--	include/linux/mm.h	6
-rw-r--r--	include/linux/mmzone.h	11
-rw-r--r--	init/main.c	1
-rw-r--r--	mm/mempolicy.c	2
-rw-r--r--	mm/page_alloc.c	211
6 files changed, 195 insertions(+), 38 deletions(-)
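To make the new per-cpu pool sizing concrete, the arithmetic of the
zone_batchsize() helper (added in the mm/page_alloc.c hunk below) can be
replayed in isolation.  The PAGE_SIZE and present_pages values here are
assumptions for illustration (4 KB pages, a 1 GB zone), and fls_sketch() is a
portable stand-in for the kernel's fls():

#include <stdio.h>

/* Same arithmetic as the patch's zone_batchsize(), lifted into userspace. */
static int fls_sketch(unsigned int x)   /* index of the highest set bit */
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

int main(void)
{
	const long PAGE_SIZE = 4096;            /* assumed 4 KB pages */
	long present_pages = 262144;            /* assumed 1 GB zone */
	long batch;

	batch = present_pages / 1024;           /* ~1/1000th of the zone: 256 */
	if (batch * PAGE_SIZE > 256 * 1024)     /* cap at 256 KB worth of pages */
		batch = (256 * 1024) / PAGE_SIZE;   /* -> 64 */
	batch /= 4;                             /* -> 16; scaled back up below */
	if (batch < 1)
		batch = 1;
	batch = (1 << fls_sketch(batch + batch / 2)) - 1;   /* -> 31 */

	printf("batch=%ld  hot: low=%ld high=%ld  cold: low=0 high=%ld\n",
	       batch, 2 * batch, 6 * batch, 2 * batch);
	return 0;
}

With these assumptions the batch works out to 31, a 2^n - 1 value (power-of-two
batches were found to alias page colors badly), giving the hot list low/high
watermarks of 62/186 pages and the cold list 0/62.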
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d4517ccc422..904b27caf697 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *z = &pg->node_zones[i];
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pageset *ps = &z->pageset[cpu];
+			struct per_cpu_pageset *ps = zone_pcp(z,cpu);
 			numa_hit += ps->numa_hit;
 			numa_miss += ps->numa_miss;
 			numa_foreign += ps->numa_foreign;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17518fe0b311..1813b162b0a8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 
+#ifdef CONFIG_NUMA
+extern void setup_per_cpu_pageset(void);
+#else
+static inline void setup_per_cpu_pageset(void) {}
+#endif
+
 /* prio_tree.c */
 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18fed8b67943..4733d35d8223 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
 #endif
 } ____cacheline_aligned_in_smp;
 
+#ifdef CONFIG_NUMA
+#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
+#else
+#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
+#endif
+
 #define ZONE_DMA		0
 #define ZONE_NORMAL		1
 #define ZONE_HIGHMEM		2
@@ -122,8 +128,11 @@ struct zone {
 	 */
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
+#ifdef CONFIG_NUMA
+	struct per_cpu_pageset	*pageset[NR_CPUS];
+#else
 	struct per_cpu_pageset	pageset[NR_CPUS];
-
+#endif
 	/*
 	 * free areas of different sizes
 	 */
diff --git a/init/main.c b/init/main.c
index 40bf367ffdf1..d324801729ba 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	mem_init();
 	kmem_cache_init();
+	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
-		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 		put_cpu();
 	}
 	return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
 
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
+
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z,cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
 		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
 		if (!cpu_possible(cpu))
 			continue;
 
-		pageset = zone->pageset + cpu;
+		pageset = zone_pcp(zone, cpu);
 
 		for (temperature = 0; temperature < 2; temperature++)
 			printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone. But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is. So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << fls(batch + batch/2)) - 1;
+	return batch;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Dynamicaly allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *npageset = NULL;
+
+		npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+				GFP_KERNEL, cpu_to_node(cpu));
+		if (!npageset) {
+			zone->pageset[cpu] = NULL;
+			goto bad;
+		}
+
+		if (zone->pageset[cpu]) {
+			memcpy(npageset, zone->pageset[cpu],
+					sizeof(struct per_cpu_pageset));
+
+			/* Relocate lists */
+			for (i = 0; i < 2; i++) {
+				INIT_LIST_HEAD(&npageset->pcp[i].list);
+				list_splice(&zone->pageset[cpu]->pcp[i].list,
+					&npageset->pcp[i].list);
+			}
+		} else {
+			struct per_cpu_pages *pcp;
+			unsigned long batch;
+
+			batch = zone_batchsize(zone);
+
+			pcp = &npageset->pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &npageset->pcp[1];	/* cold*/
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		zone->pageset[cpu] = npageset;
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(dzone->pageset[cpu]);
+		dzone->pageset[cpu] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone. But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is. So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+			struct per_cpu_pageset *pgset;
+			pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+					(j * NR_CPUS) + cpu];
+
+			zone->pageset[cpu] = pgset;
+#else
+			struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
 
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp = &pgset->pcp[0];			/* hot */
 			pcp->count = 0;
 			pcp->low = 2 * batch;
 			pcp->high = 6 * batch;
 			pcp->batch = 1 * batch;
 			INIT_LIST_HEAD(&pcp->list);
 
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp = &pgset->pcp[1];			/* cold */
 			pcp->count = 0;
 			pcp->low = 0;
 			pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		struct per_cpu_pageset *pageset;
 		int j;
 
-		pageset = &zone->pageset[i];
+		pageset = zone_pcp(zone, i);
 		for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
 			if (pageset->pcp[j].count)
 				break;