author     Jeff Garzik <jgarzik@pretzel.yyz.us>  2005-06-26 23:38:58 -0400
committer  Jeff Garzik <jgarzik@pobox.com>       2005-06-26 23:38:58 -0400
commit     5696c1944a33b4434a9a1ebb6383b906afd43a10 (patch)
tree       16fbe6ba431bcf949ee8645510b0c2fd39b5810f /mm/page_alloc.c
parent     66b04a80eea60cabf9d89fd34deb3234a740052f (diff)
parent     020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff)
Merge /spare/repo/linux-2.6/
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  481
1 file changed, 402 insertions(+), 79 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
         printk(KERN_EMERG "Backtrace:\n");
         dump_stack();
         printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-        page->flags &= ~(1 << PG_private   |
+        page->flags &= ~(1 << PG_lru       |
+                        1 << PG_private    |
                         1 << PG_locked     |
-                        1 << PG_lru        |
                         1 << PG_active     |
                         1 << PG_dirty      |
+                        1 << PG_reclaim    |
+                        1 << PG_slab       |
                         1 << PG_swapcache  |
                         1 << PG_writeback);
         set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
  */
 static void prep_new_page(struct page *page, int order)
 {
-        if (page->mapping || page_mapcount(page) ||
-            (page->flags & (
+        if (    page_mapcount(page) ||
+                page->mapping != NULL ||
+                page_count(page) != 0 ||
+                (page->flags & (
+                        1 << PG_lru        |
                         1 << PG_private    |
                         1 << PG_locked     |
-                        1 << PG_lru        |
                         1 << PG_active     |
                         1 << PG_dirty      |
                         1 << PG_reclaim    |
+                        1 << PG_slab       |
                         1 << PG_swapcache  |
                         1 << PG_writeback )))
                 bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
         return allocated;
 }
 
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+        struct zone *zone;
+        int i;
+        unsigned long flags;
+
+        local_irq_save(flags);
+        for_each_zone(zone) {
+                struct per_cpu_pageset *pset;
+
+                /* Do not drain local pagesets */
+                if (zone->zone_pgdat->node_id == numa_node_id())
+                        continue;
+
+                pset = zone->pageset[smp_processor_id()];
+                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+                        struct per_cpu_pages *pcp;
+
+                        pcp = &pset->pcp[i];
+                        if (pcp->count)
+                                pcp->count -= free_pages_bulk(zone, pcp->count,
+                                                &pcp->list, 0);
+                }
+        }
+        local_irq_restore(flags);
+}
+#endif
+
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
         for_each_zone(zone) {
                 struct per_cpu_pageset *pset;
 
-                pset = &zone->pageset[cpu];
+                pset = zone_pcp(zone, cpu);
                 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                         struct per_cpu_pages *pcp;
 
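The conversions in this and the following hunks index the per-cpu pagesets through a zone_pcp() accessor instead of touching zone->pageset directly. That accessor is defined in the mm headers touched elsewhere in this merge, not in this file; a minimal sketch of the assumed shape (NUMA keeps per-cpu pointers allocated on the pageset's home node, the non-NUMA case keeps the embedded array) is:

/* Sketch only -- the real definition lives in include/linux/mmzone.h,
 * which is outside this diff. */
#ifdef CONFIG_NUMA
#define zone_pcp(__z, __cpu)    ((__z)->pageset[(__cpu)])     /* per-cpu pointer */
#else
#define zone_pcp(__z, __cpu)    (&(__z)->pageset[(__cpu)])    /* embedded array  */
#endif

Hiding the representation behind one macro is what lets the NUMA case switch to node-local kmalloc_node() allocations later in this patch without touching every caller.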
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
         local_irq_save(flags);
         cpu = smp_processor_id();
-        p = &z->pageset[cpu];
+        p = zone_pcp(z,cpu);
         if (pg == orig) {
-                z->pageset[cpu].numa_hit++;
+                p->numa_hit++;
         } else {
                 p->numa_miss++;
-                zonelist->zones[0]->pageset[cpu].numa_foreign++;
+                zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
         }
         if (pg == NODE_DATA(numa_node_id()))
                 p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
         if (PageAnon(page))
                 page->mapping = NULL;
         free_pages_check(__FUNCTION__, page);
-        pcp = &zone->pageset[get_cpu()].pcp[cold];
+        pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
         local_irq_save(flags);
-        if (pcp->count >= pcp->high)
-                pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
         list_add(&page->lru, &pcp->list);
         pcp->count++;
+        if (pcp->count >= pcp->high)
+                pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
         local_irq_restore(flags);
         put_cpu();
 }
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
         if (order == 0) {
                 struct per_cpu_pages *pcp;
 
-                pcp = &zone->pageset[get_cpu()].pcp[cold];
+                pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                 local_irq_save(flags);
                 if (pcp->count <= pcp->low)
                         pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+        if (!z->reclaim_pages)
+                return 0;
+        if (gfp_mask & __GFP_NORECLAIM)
+                return 0;
+        return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
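should_reclaim_zone() gives callers a per-allocation opt-out from the early zone-reclaim pass that the next hunk adds to __alloc_pages(). A hypothetical caller-side example (the flag combination and call site are illustrative only, not part of this diff):

/* Illustrative only: passing __GFP_NORECLAIM makes should_reclaim_zone()
 * return 0, so the allocator falls straight through to the next zone
 * instead of attempting local reclaim first. */
struct page *page = alloc_pages(GFP_HIGHUSER | __GFP_NORECLAIM, 0);
if (page)
        __free_pages(page, 0);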
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 
         classzone_idx = zone_idx(zones[0]);
 
- restart:
+restart:
         /* Go through the zonelist once, looking for a zone with enough free */
         for (i = 0; (z = zones[i]) != NULL; i++) {
-
-                if (!zone_watermark_ok(z, order, z->pages_low,
-                                       classzone_idx, 0, 0))
-                        continue;
+                int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
                 if (!cpuset_zone_allowed(z))
                         continue;
 
+                /*
+                 * If the zone is to attempt early page reclaim then this loop
+                 * will try to reclaim pages and check the watermark a second
+                 * time before giving up and falling back to the next zone.
+                 */
+zone_reclaim_retry:
+                if (!zone_watermark_ok(z, order, z->pages_low,
+                                       classzone_idx, 0, 0)) {
+                        if (!do_reclaim)
+                                continue;
+                        else {
+                                zone_reclaim(z, gfp_mask, order);
+                                /* Only try reclaim once */
+                                do_reclaim = 0;
+                                goto zone_reclaim_retry;
+                        }
+                }
+
                 page = buffered_rmqueue(z, order, gfp_mask);
                 if (page)
                         goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
 
-        did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+        did_some_progress = try_to_free_pages(zones, gfp_mask);
 
         p->reclaim_state = NULL;
         p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
                         " order:%d, mode:0x%x\n",
                         p->comm, order, gfp_mask);
                 dump_stack();
+                show_mem();
         }
         return NULL;
 got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
         __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 }
 
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
 {
         unsigned long ret = 0;
         int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
         return ret;
 }
 
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
 {
         unsigned long flags;
         void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
                         if (!cpu_possible(cpu))
                                 continue;
 
-                        pageset = zone->pageset + cpu;
+                        pageset = zone_pcp(zone, cpu);
 
                         for (temperature = 0; temperature < 2; temperature++)
-                                printk("cpu %d %s: low %d, high %d, batch %d\n",
+                                printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
                                         cpu,
                                         temperature ? "cold" : "hot",
                                         pageset->pcp[temperature].low,
                                         pageset->pcp[temperature].high,
-                                        pageset->pcp[temperature].batch);
+                                        pageset->pcp[temperature].batch,
+                                        pageset->pcp[temperature].count);
                 }
         }
 
         get_page_state(&ps);
         get_zone_counts(&active, &inactive, &free);
 
-        printk("\nFree pages:     %11ukB (%ukB HighMem)\n",
+        printk("Free pages:     %11ukB (%ukB HighMem)\n",
                 K(nr_free_pages()),
                 K(nr_free_highpages()));
 
@@ -1587,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                 unsigned long start_pfn)
 {
-        struct page *start = pfn_to_page(start_pfn);
         struct page *page;
+        unsigned long end_pfn = start_pfn + size;
+        unsigned long pfn;
 
-        for (page = start; page < (start + size); page++) {
-                set_page_zone(page, NODEZONE(nid, zone));
+        for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+                if (!early_pfn_valid(pfn))
+                        continue;
+                if (!early_pfn_in_nid(pfn, nid))
+                        continue;
+                page = pfn_to_page(pfn);
+                set_page_links(page, zone, nid, pfn);
                 set_page_count(page, 0);
                 reset_page_mapcount(page);
                 SetPageReserved(page);
@@ -1615,11 +1683,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
         }
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)     ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+                unsigned long size)
+{
+        unsigned long snum = pfn_to_section_nr(pfn);
+        unsigned long end = pfn_to_section_nr(pfn + size);
+
+        if (FLAGS_HAS_NODE)
+                zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+        else
+                for (; snum <= end; snum++)
+                        zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
         memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+        int batch;
+
+        /*
+         * The per-cpu-pages pools are set to around 1000th of the
+         * size of the zone.  But no more than 1/4 of a meg - there's
+         * no point in going beyond the size of L2 cache.
+         *
+         * OK, so we don't know how big the cache is.  So guess.
+         */
+        batch = zone->present_pages / 1024;
+        if (batch * PAGE_SIZE > 256 * 1024)
+                batch = (256 * 1024) / PAGE_SIZE;
+        batch /= 4;             /* We effectively *= 4 below */
+        if (batch < 1)
+                batch = 1;
+
+        /*
+         * Clamp the batch to a 2^n - 1 value. Having a power
+         * of 2 value was found to be more likely to have
+         * suboptimal cache aliasing properties in some cases.
+         *
+         * For example if 2 tasks are alternately allocating
+         * batches of pages, one task can end up with a lot
+         * of pages of one half of the possible page colors
+         * and the other with pages of the other colors.
+         */
+        batch = (1 << fls(batch + batch/2)) - 1;
+        return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+        struct per_cpu_pages *pcp;
+
+        pcp = &p->pcp[0];               /* hot */
+        pcp->count = 0;
+        pcp->low = 2 * batch;
+        pcp->high = 6 * batch;
+        pcp->batch = max(1UL, 1 * batch);
+        INIT_LIST_HEAD(&pcp->list);
+
+        pcp = &p->pcp[1];               /* cold*/
+        pcp->count = 0;
+        pcp->low = 0;
+        pcp->high = 2 * batch;
+        pcp->batch = max(1UL, 1 * batch);
+        INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+        boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+        struct zone *zone, *dzone;
+
+        for_each_zone(zone) {
+
+                zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+                                         GFP_KERNEL, cpu_to_node(cpu));
+                if (!zone->pageset[cpu])
+                        goto bad;
+
+                setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+        }
+
+        return 0;
+bad:
+        for_each_zone(dzone) {
+                if (dzone == zone)
+                        break;
+                kfree(dzone->pageset[cpu]);
+                dzone->pageset[cpu] = NULL;
+        }
+        return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+        struct zone *zone;
+
+        for_each_zone(zone) {
+                struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+                zone_pcp(zone, cpu) = NULL;
+                kfree(pset);
+        }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+                unsigned long action,
+                void *hcpu)
+{
+        int cpu = (long)hcpu;
+        int ret = NOTIFY_OK;
+
+        switch (action) {
+        case CPU_UP_PREPARE:
+                if (process_zones(cpu))
+                        ret = NOTIFY_BAD;
+                break;
+#ifdef CONFIG_HOTPLUG_CPU
+        case CPU_DEAD:
+                free_zone_pagesets(cpu);
+                break;
+#endif
+        default:
+                break;
+        }
+        return ret;
+}
+
+static struct notifier_block pageset_notifier =
+        { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+        int err;
+
+        /* Initialize per_cpu_pageset for cpu 0.
+         * A cpuup callback will do this for every cpu
+         * as it comes online
+         */
+        err = process_zones(smp_processor_id());
+        BUG_ON(err);
+        register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
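To make the zone_batchsize() heuristic added above concrete, here is a worked example as a standalone user-space sketch (not kernel code); it assumes 4 KiB pages and a zone of 262144 present pages, i.e. 1 GiB, and open-codes fls() so it compiles on its own:

/* Standalone sketch of the zone_batchsize() arithmetic above,
 * assuming PAGE_SIZE == 4096 and present_pages == 262144 (1 GiB). */
#include <stdio.h>

static int fls_sketch(long x)           /* highest set bit, 1-based, like fls() */
{
        int r = 0;
        while (x) { r++; x >>= 1; }
        return r;
}

int main(void)
{
        long page_size = 4096, present_pages = 262144;
        long batch = present_pages / 1024;                /* 256 */
        if (batch * page_size > 256 * 1024)
                batch = (256 * 1024) / page_size;         /* 64  */
        batch /= 4;                                       /* 16  */
        if (batch < 1)
                batch = 1;
        batch = (1 << fls_sketch(batch + batch / 2)) - 1; /* fls(24) = 5 -> 31 */
        printf("batch=%ld hot low=%ld hot high=%ld cold high=%ld\n",
               batch, 2 * batch, 6 * batch, 2 * batch);   /* 31 62 186 62 */
        return 0;
}

Feeding that batch of 31 into setup_pageset() gives the hot list low/high marks of 62/186 and the cold list high mark of 62 that show_free_areas() and the new /proc/zoneinfo output report per cpu.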
@@ -1643,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 unsigned long size, realsize;
                 unsigned long batch;
 
-                zone_table[NODEZONE(nid, j)] = zone;
                 realsize = size = zones_size[j];
                 if (zholes_size)
                         realsize -= zholes_size[j];
@@ -1662,48 +1899,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
                 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-                /*
-                 * The per-cpu-pages pools are set to around 1000th of the
-                 * size of the zone.  But no more than 1/4 of a meg - there's
-                 * no point in going beyond the size of L2 cache.
-                 *
-                 * OK, so we don't know how big the cache is.  So guess.
-                 */
-                batch = zone->present_pages / 1024;
-                if (batch * PAGE_SIZE > 256 * 1024)
-                        batch = (256 * 1024) / PAGE_SIZE;
-                batch /= 4;             /* We effectively *= 4 below */
-                if (batch < 1)
-                        batch = 1;
-
-                /*
-                 * Clamp the batch to a 2^n - 1 value. Having a power
-                 * of 2 value was found to be more likely to have
-                 * suboptimal cache aliasing properties in some cases.
-                 *
-                 * For example if 2 tasks are alternately allocating
-                 * batches of pages, one task can end up with a lot
-                 * of pages of one half of the possible page colors
-                 * and the other with pages of the other colors.
-                 */
-                batch = (1 << fls(batch + batch/2)) - 1;
+                batch = zone_batchsize(zone);
 
                 for (cpu = 0; cpu < NR_CPUS; cpu++) {
-                        struct per_cpu_pages *pcp;
-
-                        pcp = &zone->pageset[cpu].pcp[0];       /* hot */
-                        pcp->count = 0;
-                        pcp->low = 2 * batch;
-                        pcp->high = 6 * batch;
-                        pcp->batch = 1 * batch;
-                        INIT_LIST_HEAD(&pcp->list);
-
-                        pcp = &zone->pageset[cpu].pcp[1];       /* cold */
-                        pcp->count = 0;
-                        pcp->low = 0;
-                        pcp->high = 2 * batch;
-                        pcp->batch = 1 * batch;
-                        INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+                        /* Early boot. Slab allocator not functional yet */
+                        zone->pageset[cpu] = &boot_pageset[cpu];
+                        setup_pageset(&boot_pageset[cpu],0);
+#else
+                        setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
                 }
                 printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
                         zone_names[j], realsize, batch);
@@ -1713,6 +1918,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                 zone->nr_scan_inactive = 0;
                 zone->nr_active = 0;
                 zone->nr_inactive = 0;
+                atomic_set(&zone->reclaim_in_progress, -1);
                 if (!size)
                         continue;
 
@@ -1740,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
                 memmap_init(size, nid, j, zone_start_pfn);
 
+                zonetable_add(zone, nid, j, zone_start_pfn, size);
+
                 zone_start_pfn += size;
 
                 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1748,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-        unsigned long size;
-
         /* Skip empty nodes */
         if (!pgdat->node_spanned_pages)
                 return;
 
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
         /* ia64 gets its own node_mem_map, before this, without bootmem */
         if (!pgdat->node_mem_map) {
+                unsigned long size;
+                struct page *map;
+
                 size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-                pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+                map = alloc_remap(pgdat->node_id, size);
+                if (!map)
+                        map = alloc_bootmem_node(pgdat, size);
+                pgdat->node_mem_map = map;
         }
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
         /*
          * With no DISCONTIG, the global mem_map is just set as node 0's
          */
         if (pgdat == NODE_DATA(0))
                 mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1781,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
         free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 
 EXPORT_SYMBOL(contig_page_data);
+#endif
 
 void __init free_area_init(unsigned long *zones_size)
 {
-        free_area_init_node(0, &contig_page_data, zones_size,
+        free_area_init_node(0, NODE_DATA(0), zones_size,
                         __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
-#endif
 
 #ifdef CONFIG_PROC_FS
 
@@ -1853,6 +2067,115 @@ struct seq_operations fragmentation_op = {
         .show   = frag_show,
 };
 
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+        pg_data_t *pgdat = arg;
+        struct zone *zone;
+        struct zone *node_zones = pgdat->node_zones;
+        unsigned long flags;
+
+        for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+                int i;
+
+                if (!zone->present_pages)
+                        continue;
+
+                spin_lock_irqsave(&zone->lock, flags);
+                seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+                seq_printf(m,
+                           "\n  pages free     %lu"
+                           "\n        min      %lu"
+                           "\n        low      %lu"
+                           "\n        high     %lu"
+                           "\n        active   %lu"
+                           "\n        inactive %lu"
+                           "\n        scanned  %lu (a: %lu i: %lu)"
+                           "\n        spanned  %lu"
+                           "\n        present  %lu",
+                           zone->free_pages,
+                           zone->pages_min,
+                           zone->pages_low,
+                           zone->pages_high,
+                           zone->nr_active,
+                           zone->nr_inactive,
+                           zone->pages_scanned,
+                           zone->nr_scan_active, zone->nr_scan_inactive,
+                           zone->spanned_pages,
+                           zone->present_pages);
+                seq_printf(m,
+                           "\n        protection: (%lu",
+                           zone->lowmem_reserve[0]);
+                for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+                        seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+                seq_printf(m,
+                           ")"
+                           "\n  pagesets");
+                for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+                        struct per_cpu_pageset *pageset;
+                        int j;
+
+                        pageset = zone_pcp(zone, i);
+                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+                                if (pageset->pcp[j].count)
+                                        break;
+                        }
+                        if (j == ARRAY_SIZE(pageset->pcp))
+                                continue;
+                        for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+                                seq_printf(m,
+                                           "\n    cpu: %i pcp: %i"
+                                           "\n              count: %i"
+                                           "\n              low:   %i"
+                                           "\n              high:  %i"
+                                           "\n              batch: %i",
+                                           i, j,
+                                           pageset->pcp[j].count,
+                                           pageset->pcp[j].low,
+                                           pageset->pcp[j].high,
+                                           pageset->pcp[j].batch);
+                        }
+#ifdef CONFIG_NUMA
+                        seq_printf(m,
+                                   "\n            numa_hit:       %lu"
+                                   "\n            numa_miss:      %lu"
+                                   "\n            numa_foreign:   %lu"
+                                   "\n            interleave_hit: %lu"
+                                   "\n            local_node:     %lu"
+                                   "\n            other_node:     %lu",
+                                   pageset->numa_hit,
+                                   pageset->numa_miss,
+                                   pageset->numa_foreign,
+                                   pageset->interleave_hit,
+                                   pageset->local_node,
+                                   pageset->other_node);
+#endif
+                }
+                seq_printf(m,
+                           "\n  all_unreclaimable: %u"
+                           "\n  prev_priority:     %i"
+                           "\n  temp_priority:     %i"
+                           "\n  start_pfn:         %lu",
+                           zone->all_unreclaimable,
+                           zone->prev_priority,
+                           zone->temp_priority,
+                           zone->zone_start_pfn);
+                spin_unlock_irqrestore(&zone->lock, flags);
+                seq_putc(m, '\n');
+        }
+        return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+        .start  = frag_start, /* iterate over all zones. The same as in
+                               * fragmentation. */
+        .next   = frag_next,
+        .stop   = frag_stop,
+        .show   = zoneinfo_show,
+};
+
 static char *vmstat_text[] = {
         "nr_dirty",
         "nr_writeback",
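zoneinfo_op only supplies the seq_file iterator; registering it as /proc/zoneinfo is done outside this file (in fs/proc/ in kernels of this era), so that part is not visible in this diff. A hedged sketch of the usual seq_file wiring for such an entry looks like:

/* Sketch only: the actual /proc registration for zoneinfo_op is not
 * part of this diff; this just shows the conventional seq_file glue. */
static int zoneinfo_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &zoneinfo_op);
}

static struct file_operations proc_zoneinfo_file_operations = {
        .open           = zoneinfo_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release,
};

With that glue in place, reading /proc/zoneinfo walks every zone via frag_start/frag_next and prints one zoneinfo_show() block per populated zone.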
@@ -2058,10 +2381,10 @@ static void setup_per_zone_pages_min(void)
                                 min_pages = 128;
                         zone->pages_min = min_pages;
                 } else {
-                        /* if it's a lowmem zone, reserve a number of pages
-                         * proportionate to the zone's size.
-                         */
-                        zone->pages_min = (pages_min * zone->present_pages) /
-                                           lowmem_pages;
+                        /* if it's a lowmem zone, reserve a number of pages
+                         * proportionate to the zone's size.
+                         */
+                        zone->pages_min = (pages_min * zone->present_pages) /
+                                           lowmem_pages;
                 }
 