-rw-r--r--  drivers/base/node.c     |   2
-rw-r--r--  include/linux/mm.h      |   6
-rw-r--r--  include/linux/mmzone.h  |  11
-rw-r--r--  init/main.c             |   1
-rw-r--r--  mm/mempolicy.c          |   2
-rw-r--r--  mm/page_alloc.c         | 211
6 files changed, 195 insertions, 38 deletions
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d4517ccc422..904b27caf697 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -87,7 +87,7 @@ static ssize_t node_read_numastat(struct sys_device * dev, char * buf)
 	for (i = 0; i < MAX_NR_ZONES; i++) {
 		struct zone *z = &pg->node_zones[i];
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pageset *ps = &z->pageset[cpu];
+			struct per_cpu_pageset *ps = zone_pcp(z,cpu);
 			numa_hit += ps->numa_hit;
 			numa_miss += ps->numa_miss;
 			numa_foreign += ps->numa_foreign;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17518fe0b311..1813b162b0a8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -691,6 +691,12 @@ extern void show_mem(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
 
+#ifdef CONFIG_NUMA
+extern void setup_per_cpu_pageset(void);
+#else
+static inline void setup_per_cpu_pageset(void) {}
+#endif
+
 /* prio_tree.c */
 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 18fed8b67943..4733d35d8223 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -63,6 +63,12 @@ struct per_cpu_pageset {
 #endif
 } ____cacheline_aligned_in_smp;
 
+#ifdef CONFIG_NUMA
+#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
+#else
+#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
+#endif
+
 #define ZONE_DMA		0
 #define ZONE_NORMAL		1
 #define ZONE_HIGHMEM		2
@@ -122,8 +128,11 @@ struct zone {
 	 */
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
+#ifdef CONFIG_NUMA
+	struct per_cpu_pageset	*pageset[NR_CPUS];
+#else
 	struct per_cpu_pageset	pageset[NR_CPUS];
-
+#endif
 	/*
	 * free areas of different sizes
	 */
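
The zone_pcp() accessor introduced above hides the layout difference at every call site: with CONFIG_NUMA, zone->pageset[] becomes an array of pointers to pagesets allocated on the zone's node, while without it the pagesets stay embedded in struct zone. A minimal illustration of how both variants hand back a per_cpu_pageset pointer (not part of the patch; a fragment assuming zone and cpu are in scope):

	struct per_cpu_pageset *ps;

	#ifdef CONFIG_NUMA
	ps = zone->pageset[cpu];	/* zone_pcp(zone, cpu): macro yields the stored pointer */
	#else
	ps = &zone->pageset[cpu];	/* zone_pcp(zone, cpu): macro takes the embedded struct's address */
	#endif
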
diff --git a/init/main.c b/init/main.c
index 40bf367ffdf1..d324801729ba 100644
--- a/init/main.c
+++ b/init/main.c
@@ -490,6 +490,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	mem_init();
 	kmem_cache_init();
+	setup_per_cpu_pageset();
 	numa_policy_init();
 	if (late_time_init)
 		late_time_init();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..39252c732db2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -721,7 +721,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
 	zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
-		zl->zones[0]->pageset[get_cpu()].interleave_hit++;
+		zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
 		put_cpu();
 	}
 	return page;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2019c1b19254..95cbd30a67b8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,6 +71,11 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
 
+#ifdef CONFIG_NUMA
+static struct per_cpu_pageset
+	pageset_table[MAX_NR_ZONES*MAX_NUMNODES*NR_CPUS] __initdata;
+#endif
+
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
@@ -520,7 +525,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
@@ -583,12 +588,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z,cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,7 +620,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
 	if (pcp->count >= pcp->high)
 		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -659,7 +664,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -1262,7 +1267,7 @@ void show_free_areas(void)
 		if (!cpu_possible(cpu))
 			continue;
 
-		pageset = zone->pageset + cpu;
+		pageset = zone_pcp(zone, cpu);
 
 		for (temperature = 0; temperature < 2; temperature++)
 			printk("cpu %d %s: low %d, high %d, batch %d\n",
@@ -1645,6 +1650,157 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << fls(batch + batch/2)) - 1;
+	return batch;
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+	int i;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *npageset = NULL;
+
+		npageset = kmalloc_node(sizeof(struct per_cpu_pageset),
+					GFP_KERNEL, cpu_to_node(cpu));
+		if (!npageset) {
+			zone->pageset[cpu] = NULL;
+			goto bad;
+		}
+
+		if (zone->pageset[cpu]) {
+			memcpy(npageset, zone->pageset[cpu],
+					sizeof(struct per_cpu_pageset));
+
+			/* Relocate lists */
+			for (i = 0; i < 2; i++) {
+				INIT_LIST_HEAD(&npageset->pcp[i].list);
+				list_splice(&zone->pageset[cpu]->pcp[i].list,
+					&npageset->pcp[i].list);
+			}
+		} else {
+			struct per_cpu_pages *pcp;
+			unsigned long batch;
+
+			batch = zone_batchsize(zone);
+
+			pcp = &npageset->pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &npageset->pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		zone->pageset[cpu] = npageset;
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(dzone->pageset[cpu]);
+		dzone->pageset[cpu] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
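
To make the batch heuristic in zone_batchsize() above concrete, here is a worked example for a hypothetical zone with 262144 present pages (1 GiB of memory, assuming PAGE_SIZE is 4096); the values in the comments simply follow the code:

	unsigned long present_pages = 262144;		/* hypothetical zone size */
	int batch;

	batch = present_pages / 1024;			/* 256 */
	if (batch * PAGE_SIZE > 256 * 1024)		/* 256 * 4096 = 1 MiB > 256 KiB */
		batch = (256 * 1024) / PAGE_SIZE;	/* 64 */
	batch /= 4;					/* 16 */
	if (batch < 1)
		batch = 1;
	batch = (1 << fls(batch + batch/2)) - 1;	/* fls(24) = 5, so batch = 31 */

With batch = 31, the hot pcp list gets low = 62, high = 186, batch = 31, and the cold list gets low = 0, high = 62, batch = 31.
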
@@ -1687,43 +1843,28 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
 			struct per_cpu_pages *pcp;
+#ifdef CONFIG_NUMA
+			struct per_cpu_pageset *pgset;
+			pgset = &pageset_table[nid*MAX_NR_ZONES*NR_CPUS +
+				    (j * NR_CPUS) + cpu];
+
+			zone->pageset[cpu] = pgset;
+#else
+			struct per_cpu_pageset *pgset = zone_pcp(zone, cpu);
+#endif
 
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp = &pgset->pcp[0];	/* hot */
 			pcp->count = 0;
 			pcp->low = 2 * batch;
 			pcp->high = 6 * batch;
 			pcp->batch = 1 * batch;
 			INIT_LIST_HEAD(&pcp->list);
 
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp = &pgset->pcp[1];	/* cold */
 			pcp->count = 0;
 			pcp->low = 0;
 			pcp->high = 2 * batch;
@@ -1929,7 +2070,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		struct per_cpu_pageset *pageset;
 		int j;
 
-		pageset = &zone->pageset[i];
+		pageset = zone_pcp(zone, i);
 		for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
 			if (pageset->pcp[j].count)
 				break;
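
Read together, the mm/page_alloc.c changes give each per-cpu pageset a two-stage life on NUMA kernels: free_area_init_core() first points zone->pageset[cpu] at an entry of the static __initdata pageset_table so the page allocator works before slab is available, and setup_per_cpu_pageset() (called from start_kernel() right after kmem_cache_init()) has process_zones() allocate a node-local replacement with kmalloc_node(), copy the counters, splice over any pages already on the pcp lists, and swap the pointer. A condensed sketch of that ordering (a summary of the patch, not literal kernel code):

	/* Boot CPU, CONFIG_NUMA kernel:
	 *  1. free_area_init_core():   zone->pageset[cpu] = &pageset_table[...];
	 *  2. kmem_cache_init():        slab is up, kmalloc_node() becomes usable
	 *  3. setup_per_cpu_pageset():  process_zones(boot cpu) replaces each static
	 *                               entry with a copy allocated on cpu_to_node(cpu)
	 * Other CPUs: CPU_UP_PREPARE -> process_zones(cpu)
	 *             CPU_DEAD       -> free_zone_pagesets(cpu)   (CONFIG_HOTPLUG_CPU)
	 */
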