author     Jeff Garzik <jgarzik@pretzel.yyz.us>      2005-06-26 23:38:58 -0400
committer  Jeff Garzik <jgarzik@pobox.com>           2005-06-26 23:38:58 -0400
commit     5696c1944a33b4434a9a1ebb6383b906afd43a10 (patch)
tree       16fbe6ba431bcf949ee8645510b0c2fd39b5810f /mm/page_alloc.c
parent     66b04a80eea60cabf9d89fd34deb3234a740052f (diff)
parent     020f46a39eb7b99a575b9f4d105fce2b142acdf1 (diff)

Merge /spare/repo/linux-2.6/

Diffstat (limited to 'mm/page_alloc.c')

 mm/page_alloc.c | 481 ++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 402 insertions(+), 79 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
 	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
 	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-	page->flags &= ~(1 << PG_private |
+	page->flags &= ~(1 << PG_lru	|
+			1 << PG_private	|
 			1 << PG_locked	|
-			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
+			1 << PG_reclaim	|
+			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
  */
 static void prep_new_page(struct page *page, int order)
 {
-	if (page->mapping || page_mapcount(page) ||
-	    (page->flags & (
+	if (	page_mapcount(page) ||
+		page->mapping != NULL ||
+		page_count(page) != 0 ||
+		(page->flags & (
+			1 << PG_lru	|
 			1 << PG_private	|
 			1 << PG_locked	|
-			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
+			1 << PG_slab	|
 			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 	return allocated;
 }
 
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+	struct zone *zone;
+	int i;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset;
+
+		/* Do not drain local pagesets */
+		if (zone->zone_pgdat->node_id == numa_node_id())
+			continue;
+
+		pset = zone->pageset[smp_processor_id()];
+		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &pset->pcp[i];
+			if (pcp->count)
+				pcp->count -= free_pages_bulk(zone, pcp->count,
+						&pcp->list, 0);
+		}
+	}
+	local_irq_restore(flags);
+}
+#endif
+
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
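
The new drain_remote_pages() lets the slab reaper hand back pages that this CPU has cached for zones on other nodes; local pagesets are deliberately skipped. The intended caller is the per-CPU slab reaper in mm/slab.c (wired up elsewhere in this series); a minimal sketch of that call pattern, with cache_reap_sketch() as a hypothetical stand-in for the real timer callback:

	/* Hypothetical stand-in for the slab reaper's periodic per-CPU
	 * timer callback; only the new drain hook is shown. */
	static void cache_reap_sketch(void)
	{
	#ifdef CONFIG_NUMA
		drain_remote_pages();	/* return remote zones' pages to buddy */
	#endif
		/* ... normal slab reaping work continues here ... */
	}
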
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		pset = &zone->pageset[cpu];
+		pset = zone_pcp(zone, cpu);
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
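
The open-coded `&zone->pageset[cpu]` references are replaced by the zone_pcp() accessor because NUMA configurations now keep a per-CPU pointer (allocated node-locally) instead of an embedded array. Paraphrasing the matching include/linux/mmzone.h change in this series (treat the exact definition as an assumption here):

	#ifdef CONFIG_NUMA
	#define zone_pcp(__z, __cpu)	((__z)->pageset[(__cpu)])	/* pointer */
	#else
	#define zone_pcp(__z, __cpu)	(&(__z)->pageset[(__cpu)])	/* embedded */
	#endif
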
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
-	p = &z->pageset[cpu];
+	p = zone_pcp(z,cpu);
 	if (pg == orig) {
-		z->pageset[cpu].numa_hit++;
+		p->numa_hit++;
 	} else {
 		p->numa_miss++;
-		zonelist->zones[0]->pageset[cpu].numa_foreign++;
+		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
 	}
 	if (pg == NODE_DATA(numa_node_id()))
 		p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(__FUNCTION__, page);
-	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 	local_irq_restore(flags);
 	put_cpu();
 }
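
Note the reordering in free_hot_cold_page(): the page is queued first and the high-watermark check runs afterwards, so the just-freed page is counted before deciding whether to drain. Worked numbers taken from the code above, assuming batch = 3, high = 6, count = 5 on entry:

	/* old order: 5 < 6, no drain; list_add -> count 6, list rests at high
	 * new order: list_add -> count 6; 6 >= 6, drain a batch -> count 3,
	 *            so the list never rests at or above pcp->high        */
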
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 		local_irq_save(flags);
 		if (pcp->count <= pcp->low)
 			pcp->count += rmqueue_bulk(zone, 0,
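
Both the free and allocate paths follow the same low/high watermark discipline: refill from the buddy allocator in batches when the list drops to `low`, spill batches back when it crosses `high`. A self-contained userspace sketch of that discipline, with names borrowed from the patch (pcp_free()/pcp_alloc() stand in for the real free_hot_cold_page()/buffered_rmqueue() paths):

	struct pcp_sketch {
		int count;	/* pages currently on the list */
		int low;	/* refill threshold */
		int high;	/* spill threshold */
		int batch;	/* pages moved per refill/spill */
	};

	static void pcp_free(struct pcp_sketch *pcp)
	{
		pcp->count++;			/* list_add(&page->lru, ...) */
		if (pcp->count >= pcp->high)
			pcp->count -= pcp->batch;	/* free_pages_bulk() */
	}

	static int pcp_alloc(struct pcp_sketch *pcp)
	{
		if (pcp->count <= pcp->low)
			pcp->count += pcp->batch;	/* rmqueue_bulk() */
		if (!pcp->count)
			return -1;	/* fall through to the buddy lists */
		pcp->count--;		/* take one page off the list */
		return 0;
	}
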
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+	if (!z->reclaim_pages)
+		return 0;
+	if (gfp_mask & __GFP_NORECLAIM)
+		return 0;
+	return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0))
-			continue;
+		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
 		if (!cpuset_zone_allowed(z))
 			continue;
 
+		/*
+		 * If the zone is to attempt early page reclaim then this loop
+		 * will try to reclaim pages and check the watermark a second
+		 * time before giving up and falling back to the next zone.
+		 */
+zone_reclaim_retry:
+		if (!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, 0, 0)) {
+			if (!do_reclaim)
+				continue;
+			else {
+				zone_reclaim(z, gfp_mask, order);
+				/* Only try reclaim once */
+				do_reclaim = 0;
+				goto zone_reclaim_retry;
+			}
+		}
+
 		page = buffered_rmqueue(z, order, gfp_mask);
 		if (page)
 			goto got_pg;
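
should_reclaim_zone() gates the new early-reclaim pass on two things: a per-zone `reclaim_pages` toggle and a per-allocation `__GFP_NORECLAIM` opt-out, both introduced elsewhere in this series. Illustrative only, an allocation site that wants to skip the early pass would look roughly like:

	/* __GFP_NORECLAIM: skip zone_reclaim() on the first zonelist pass */
	page = alloc_pages(GFP_HIGHUSER | __GFP_NORECLAIM, 0);
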
@@ -829,7 +889,7 @@ rebalance:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+	did_some_progress = try_to_free_pages(zones, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
 			" order:%d, mode:0x%x\n",
 			p->comm, order, gfp_mask);
 		dump_stack();
+		show_mem();
 	}
 	return NULL;
 got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 }
 
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
 	void* ptr;
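
Widening `offset` from `unsigned` to `unsigned long` matches how callers produce it: as a byte offset into the per-cpu struct page_state, which is naturally a size_t/unsigned long quantity on 64-bit. A sketch of the caller-side macro family (assumed to live in include/linux/page-flags.h, paraphrased):

	#define mod_page_state(member, delta) \
		__mod_page_state(offsetof(struct page_state, member), (delta))
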
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
 			if (!cpu_possible(cpu))
 				continue;
 
-			pageset = zone->pageset + cpu;
+			pageset = zone_pcp(zone, cpu);
 
 			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: low %d, high %d, batch %d\n",
+				printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
 					cpu,
 					temperature ? "cold" : "hot",
 					pageset->pcp[temperature].low,
 					pageset->pcp[temperature].high,
-					pageset->pcp[temperature].batch);
+					pageset->pcp[temperature].batch,
+					pageset->pcp[temperature].count);
 		}
 	}
 
 	get_page_state(&ps);
 	get_zone_counts(&active, &inactive, &free);
 
-	printk("\nFree pages:     %11ukB (%ukB HighMem)\n",
+	printk("Free pages:     %11ukB (%ukB HighMem)\n",
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
@@ -1587,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
-	struct page *start = pfn_to_page(start_pfn);
 	struct page *page;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
 
-	for (page = start; page < (start + size); page++) {
-		set_page_zone(page, NODEZONE(nid, zone));
+	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		if (!early_pfn_in_nid(pfn, nid))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
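
memmap_init_zone() now walks page frame numbers instead of marching a struct page pointer across a contiguous array, so zones with holes (sparse memory models) can skip invalid or foreign pfns. The two predicates are assumed to be pass-through unless the memory model overrides them, roughly:

	/* paraphrased defaults; SPARSEMEM-style models override these */
	#ifndef early_pfn_valid
	#define early_pfn_valid(pfn)		(1)
	#endif
	#ifndef early_pfn_in_nid
	#define early_pfn_in_nid(pfn, nid)	(1)
	#endif
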
@@ -1615,11 +1683,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+	int batch;
+
+	/*
+	 * The per-cpu-pages pools are set to around 1000th of the
+	 * size of the zone.  But no more than 1/4 of a meg - there's
+	 * no point in going beyond the size of L2 cache.
+	 *
+	 * OK, so we don't know how big the cache is.  So guess.
+	 */
+	batch = zone->present_pages / 1024;
+	if (batch * PAGE_SIZE > 256 * 1024)
+		batch = (256 * 1024) / PAGE_SIZE;
+	batch /= 4;		/* We effectively *= 4 below */
+	if (batch < 1)
+		batch = 1;
+
+	/*
+	 * Clamp the batch to a 2^n - 1 value. Having a power
+	 * of 2 value was found to be more likely to have
+	 * suboptimal cache aliasing properties in some cases.
+	 *
+	 * For example if 2 tasks are alternately allocating
+	 * batches of pages, one task can end up with a lot
+	 * of pages of one half of the possible page colors
+	 * and the other with pages of the other colors.
+	 */
+	batch = (1 << fls(batch + batch/2)) - 1;
+	return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0];		/* hot */
+	pcp->count = 0;
+	pcp->low = 2 * batch;
+	pcp->high = 6 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+
+	pcp = &p->pcp[1];		/* cold */
+	pcp->count = 0;
+	pcp->low = 0;
+	pcp->high = 2 * batch;
+	pcp->batch = max(1UL, 1 * batch);
+	INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+	boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+	struct zone *zone, *dzone;
+
+	for_each_zone(zone) {
+
+		zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+					 GFP_KERNEL, cpu_to_node(cpu));
+		if (!zone->pageset[cpu])
+			goto bad;
+
+		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+	}
+
+	return 0;
+bad:
+	for_each_zone(dzone) {
+		if (dzone == zone)
+			break;
+		kfree(dzone->pageset[cpu]);
+		dzone->pageset[cpu] = NULL;
+	}
+	return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+	struct zone *zone;
+
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+		zone_pcp(zone, cpu) = NULL;
+		kfree(pset);
+	}
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	int cpu = (long)hcpu;
+	int ret = NOTIFY_OK;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (process_zones(cpu))
+			ret = NOTIFY_BAD;
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		free_zone_pagesets(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+	return ret;
+}
+
+static struct notifier_block pageset_notifier =
+	{ &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+	int err;
+
+	/* Initialize per_cpu_pageset for cpu 0.
+	 * A cpuup callback will do this for every cpu
+	 * as it comes online
+	 */
+	err = process_zones(smp_processor_id());
+	BUG_ON(err);
+	register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
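
For concreteness, here is a worked run of the zone_batchsize() arithmetic above for a 1 GiB zone with 4 KiB pages, as a userspace sketch (fls_sketch() mimics the kernel's fls()):

	#include <stdio.h>

	static int fls_sketch(unsigned int x)	/* fls(24) == 5 */
	{
		int r = 0;
		while (x) { r++; x >>= 1; }
		return r;
	}

	int main(void)
	{
		long batch = 262144 / 1024;		/* present_pages/1024 = 256 */
		if (batch * 4096 > 256 * 1024)
			batch = (256 * 1024) / 4096;	/* cap at 1/4 MiB -> 64 */
		batch /= 4;				/* 16 */
		if (batch < 1)
			batch = 1;
		batch = (1 << fls_sketch(batch + batch / 2)) - 1;	/* 31 */
		printf("batch=%ld hot low=%ld high=%ld\n",
		       batch, 2 * batch, 6 * batch);	/* 31 62 186 */
		return 0;
	}

So a 1 GiB zone ends up with a hot pageset of low = 62, high = 186, batch = 31 pages per CPU.
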
@@ -1643,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long size, realsize;
 		unsigned long batch;
 
-		zone_table[NODEZONE(nid, j)] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
@@ -1662,48 +1899,16 @@
 
 		zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-		/*
-		 * The per-cpu-pages pools are set to around 1000th of the
-		 * size of the zone.  But no more than 1/4 of a meg - there's
-		 * no point in going beyond the size of L2 cache.
-		 *
-		 * OK, so we don't know how big the cache is.  So guess.
-		 */
-		batch = zone->present_pages / 1024;
-		if (batch * PAGE_SIZE > 256 * 1024)
-			batch = (256 * 1024) / PAGE_SIZE;
-		batch /= 4;		/* We effectively *= 4 below */
-		if (batch < 1)
-			batch = 1;
-
-		/*
-		 * Clamp the batch to a 2^n - 1 value. Having a power
-		 * of 2 value was found to be more likely to have
-		 * suboptimal cache aliasing properties in some cases.
-		 *
-		 * For example if 2 tasks are alternately allocating
-		 * batches of pages, one task can end up with a lot
-		 * of pages of one half of the possible page colors
-		 * and the other with pages of the other colors.
-		 */
-		batch = (1 << fls(batch + batch/2)) - 1;
+		batch = zone_batchsize(zone);
 
 		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
-			pcp->count = 0;
-			pcp->low = 2 * batch;
-			pcp->high = 6 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
-
-			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
-			pcp->count = 0;
-			pcp->low = 0;
-			pcp->high = 2 * batch;
-			pcp->batch = 1 * batch;
-			INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+			/* Early boot. Slab allocator not functional yet */
+			zone->pageset[cpu] = &boot_pageset[cpu];
+			setup_pageset(&boot_pageset[cpu],0);
+#else
+			setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
 		}
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 				zone_names[j], realsize, batch);
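
On NUMA, free_area_init_core() runs before the slab allocator is up, so every zone temporarily points at the static boot_pageset array, initialized with batch = 0. Plugging 0 into setup_pageset() above gives:

	/* setup_pageset(p, 0):
	 *   hot:  low = 0, high = 0, batch = max(1UL, 0) = 1
	 *   cold: low = 0, high = 0, batch = 1
	 * A freed page makes count = 1 >= high (0), so it is handed
	 * straight to the buddy list; boot pagesets never cache anything. */

Real node-local pagesets replace these in process_zones() once kmalloc_node() works.
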
@@ -1713,6 +1918,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		zone->nr_scan_inactive = 0;
 		zone->nr_active = 0;
 		zone->nr_inactive = 0;
+		atomic_set(&zone->reclaim_in_progress, -1);
 		if (!size)
 			continue;
 
@@ -1740,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		memmap_init(size, nid, j, zone_start_pfn);
 
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+
 		zone_start_pfn += size;
 
 		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1748,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-	unsigned long size;
-
 	/* Skip empty nodes */
 	if (!pgdat->node_spanned_pages)
 		return;
 
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	/* ia64 gets its own node_mem_map, before this, without bootmem */
 	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
 		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
 	}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 	/*
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
 	if (pgdat == NODE_DATA(0))
 		mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1781,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 
 EXPORT_SYMBOL(contig_page_data);
+#endif
 
 void __init free_area_init(unsigned long *zones_size)
 {
-	free_area_init_node(0, &contig_page_data, zones_size,
+	free_area_init_node(0, NODE_DATA(0), zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
-#endif
 
 #ifdef CONFIG_PROC_FS
 
@@ -1853,6 +2067,115 @@ struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };
 
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+	pg_data_t *pgdat = arg;
+	struct zone *zone;
+	struct zone *node_zones = pgdat->node_zones;
+	unsigned long flags;
+
+	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+		int i;
+
+		if (!zone->present_pages)
+			continue;
+
+		spin_lock_irqsave(&zone->lock, flags);
+		seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+		seq_printf(m,
+			   "\n  pages free     %lu"
+			   "\n        min      %lu"
+			   "\n        low      %lu"
+			   "\n        high     %lu"
+			   "\n        active   %lu"
+			   "\n        inactive %lu"
+			   "\n        scanned  %lu (a: %lu i: %lu)"
+			   "\n        spanned  %lu"
+			   "\n        present  %lu",
+			   zone->free_pages,
+			   zone->pages_min,
+			   zone->pages_low,
+			   zone->pages_high,
+			   zone->nr_active,
+			   zone->nr_inactive,
+			   zone->pages_scanned,
+			   zone->nr_scan_active, zone->nr_scan_inactive,
+			   zone->spanned_pages,
+			   zone->present_pages);
+		seq_printf(m,
+			   "\n        protection: (%lu",
+			   zone->lowmem_reserve[0]);
+		for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+			seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m,
+			   ")"
+			   "\n  pagesets");
+		for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+			struct per_cpu_pageset *pageset;
+			int j;
+
+			pageset = zone_pcp(zone, i);
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				if (pageset->pcp[j].count)
+					break;
+			}
+			if (j == ARRAY_SIZE(pageset->pcp))
+				continue;
+			for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+				seq_printf(m,
+					   "\n    cpu: %i pcp: %i"
+					   "\n              count: %i"
+					   "\n              low:   %i"
+					   "\n              high:  %i"
+					   "\n              batch: %i",
+					   i, j,
+					   pageset->pcp[j].count,
+					   pageset->pcp[j].low,
+					   pageset->pcp[j].high,
+					   pageset->pcp[j].batch);
+			}
+#ifdef CONFIG_NUMA
+			seq_printf(m,
+				   "\n            numa_hit:       %lu"
+				   "\n            numa_miss:      %lu"
+				   "\n            numa_foreign:   %lu"
+				   "\n            interleave_hit: %lu"
+				   "\n            local_node:     %lu"
+				   "\n            other_node:     %lu",
+				   pageset->numa_hit,
+				   pageset->numa_miss,
+				   pageset->numa_foreign,
+				   pageset->interleave_hit,
+				   pageset->local_node,
+				   pageset->other_node);
+#endif
+		}
+		seq_printf(m,
+			   "\n  all_unreclaimable: %u"
+			   "\n  prev_priority:     %i"
+			   "\n  temp_priority:     %i"
+			   "\n  start_pfn:         %lu",
+			   zone->all_unreclaimable,
+			   zone->prev_priority,
+			   zone->temp_priority,
+			   zone->zone_start_pfn);
+		spin_unlock_irqrestore(&zone->lock, flags);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+	.start	= frag_start, /* iterate over all zones. The same as in
+			       * fragmentation. */
+	.next	= frag_next,
+	.stop	= frag_stop,
+	.show	= zoneinfo_show,
+};
+
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
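
The seq_printf() format strings above define the layout of the new /proc/zoneinfo file. A sample block, with the shape derived from the code and the numbers purely illustrative (not captured from a real system):

	Node 0, zone   Normal
	  pages free     3976
	        min      984
	        low      1968
	        high     2952
	        active   20736
	        inactive 11520
	        scanned  0 (a: 0 i: 0)
	        spanned  225280
	        present  225280
	        protection: (0, 0, 0)
	  pagesets
	    cpu: 0 pcp: 0
	              count: 28
	              low:   62
	              high:  186
	              batch: 31
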
@@ -2058,10 +2381,10 @@ static void setup_per_zone_pages_min(void)
 			min_pages = 128;
 			zone->pages_min = min_pages;
 		} else {
-			/* if it's a lowmem zone, reserve a number of pages 
+			/* if it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->pages_min = (pages_min * zone->present_pages) / 
+			zone->pages_min = (pages_min * zone->present_pages) /
 					lowmem_pages;
 		}
 