Diffstat (limited to 'mm/page_alloc.c')
 -rw-r--r--  mm/page_alloc.c | 481
 1 file changed, 402 insertions(+), 79 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
        printk(KERN_EMERG "Backtrace:\n");
        dump_stack();
        printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
-       page->flags &= ~(1 << PG_private |
+       page->flags &= ~(1 << PG_lru |
+                       1 << PG_private |
                        1 << PG_locked |
-                       1 << PG_lru |
                        1 << PG_active |
                        1 << PG_dirty |
+                       1 << PG_reclaim |
+                       1 << PG_slab |
                        1 << PG_swapcache |
                        1 << PG_writeback);
        set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
  */
 static void prep_new_page(struct page *page, int order)
 {
-       if (page->mapping || page_mapcount(page) ||
-           (page->flags & (
+       if (    page_mapcount(page) ||
+               page->mapping != NULL ||
+               page_count(page) != 0 ||
+               (page->flags & (
+                       1 << PG_lru |
                        1 << PG_private |
                        1 << PG_locked |
-                       1 << PG_lru |
                        1 << PG_active |
                        1 << PG_dirty |
                        1 << PG_reclaim |
+                       1 << PG_slab |
                        1 << PG_swapcache |
                        1 << PG_writeback )))
                bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
        return allocated;
 }
 
+#ifdef CONFIG_NUMA
+/* Called from the slab reaper to drain remote pagesets */
+void drain_remote_pages(void)
+{
+       struct zone *zone;
+       int i;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       for_each_zone(zone) {
+               struct per_cpu_pageset *pset;
+
+               /* Do not drain local pagesets */
+               if (zone->zone_pgdat->node_id == numa_node_id())
+                       continue;
+
+               pset = zone->pageset[smp_processor_id()];
+               for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
+                       struct per_cpu_pages *pcp;
+
+                       pcp = &pset->pcp[i];
+                       if (pcp->count)
+                               pcp->count -= free_pages_bulk(zone, pcp->count,
+                                               &pcp->list, 0);
+               }
+       }
+       local_irq_restore(flags);
+}
+#endif
+
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
        for_each_zone(zone) {
                struct per_cpu_pageset *pset;
 
-               pset = &zone->pageset[cpu];
+               pset = zone_pcp(zone, cpu);
                for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
                        struct per_cpu_pages *pcp;
 
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 
        local_irq_save(flags);
        cpu = smp_processor_id();
-       p = &z->pageset[cpu];
+       p = zone_pcp(z,cpu);
        if (pg == orig) {
-               z->pageset[cpu].numa_hit++;
+               p->numa_hit++;
        } else {
                p->numa_miss++;
-               zonelist->zones[0]->pageset[cpu].numa_foreign++;
+               zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
        }
        if (pg == NODE_DATA(numa_node_id()))
                p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
        if (PageAnon(page))
                page->mapping = NULL;
        free_pages_check(__FUNCTION__, page);
-       pcp = &zone->pageset[get_cpu()].pcp[cold];
+       pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
        local_irq_save(flags);
-       if (pcp->count >= pcp->high)
-               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
        list_add(&page->lru, &pcp->list);
        pcp->count++;
+       if (pcp->count >= pcp->high)
+               pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
        local_irq_restore(flags);
        put_cpu();
 }
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
        if (order == 0) {
                struct per_cpu_pages *pcp;
 
-               pcp = &zone->pageset[get_cpu()].pcp[cold];
+               pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
                local_irq_save(flags);
                if (pcp->count <= pcp->low)
                        pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
        return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+       if (!z->reclaim_pages)
+               return 0;
+       if (gfp_mask & __GFP_NORECLAIM)
+               return 0;
+       return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 
        classzone_idx = zone_idx(zones[0]);
 
 restart:
        /* Go through the zonelist once, looking for a zone with enough free */
        for (i = 0; (z = zones[i]) != NULL; i++) {
-
-               if (!zone_watermark_ok(z, order, z->pages_low,
-                               classzone_idx, 0, 0))
-                       continue;
+               int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
                if (!cpuset_zone_allowed(z))
                        continue;
 
+               /*
+                * If the zone is to attempt early page reclaim then this loop
+                * will try to reclaim pages and check the watermark a second
+                * time before giving up and falling back to the next zone.
+                */
+zone_reclaim_retry:
+               if (!zone_watermark_ok(z, order, z->pages_low,
+                               classzone_idx, 0, 0)) {
+                       if (!do_reclaim)
+                               continue;
+                       else {
+                               zone_reclaim(z, gfp_mask, order);
+                               /* Only try reclaim once */
+                               do_reclaim = 0;
+                               goto zone_reclaim_retry;
+                       }
+               }
+
                page = buffered_rmqueue(z, order, gfp_mask);
                if (page)
                        goto got_pg;
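The comment added above spells out the control flow: a zone that fails the watermark check gets one zone_reclaim() attempt, after which the watermark is re-checked before the allocator falls back to the next zone. For illustration only (not part of the patch; it uses a made-up toy zone model and helper names rather than the kernel API), the same reclaim-once retry logic in a self-contained C program:

#include <stdio.h>
#include <stdbool.h>

/* Toy zone model: a free-page count, a reclaimable-page count, a watermark. */
struct toy_zone {
        long free_pages;
        long reclaimable;
        long watermark;
};

static bool watermark_ok(const struct toy_zone *z)
{
        return z->free_pages >= z->watermark;
}

/* Stand-in for zone_reclaim(): move reclaimable pages onto the free list. */
static void reclaim_zone(struct toy_zone *z)
{
        z->free_pages += z->reclaimable;
        z->reclaimable = 0;
}

/* Each zone gets at most one early-reclaim attempt before falling back. */
static int pick_zone(struct toy_zone *zones, int nzones)
{
        for (int i = 0; i < nzones; i++) {
                bool tried_reclaim = false;
retry:
                if (!watermark_ok(&zones[i])) {
                        if (tried_reclaim)
                                continue;       /* give up on this zone */
                        reclaim_zone(&zones[i]);
                        tried_reclaim = true;   /* only try reclaim once */
                        goto retry;
                }
                return i;                       /* enough free pages here */
        }
        return -1;                              /* no zone satisfied the request */
}

int main(void)
{
        struct toy_zone zones[] = {
                { .free_pages = 10, .reclaimable = 0,  .watermark = 32 },
                { .free_pages = 20, .reclaimable = 50, .watermark = 32 },
        };
        printf("picked zone %d\n", pick_zone(zones, 2));
        return 0;
}

Running it picks zone 1: zone 0 still fails its watermark after its single reclaim attempt, while zone 1 passes once its reclaimable pages have been freed.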
@@ -829,7 +889,7 @@ rebalance:
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
-       did_some_progress = try_to_free_pages(zones, gfp_mask, order);
+       did_some_progress = try_to_free_pages(zones, gfp_mask);
 
        p->reclaim_state = NULL;
        p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
                        " order:%d, mode:0x%x\n",
                        p->comm, order, gfp_mask);
                dump_stack();
+               show_mem();
        }
        return NULL;
 got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
        __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
 }
 
-unsigned long __read_page_state(unsigned offset)
+unsigned long __read_page_state(unsigned long offset)
 {
        unsigned long ret = 0;
        int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
        return ret;
 }
 
-void __mod_page_state(unsigned offset, unsigned long delta)
+void __mod_page_state(unsigned long offset, unsigned long delta)
 {
        unsigned long flags;
        void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
                if (!cpu_possible(cpu))
                        continue;
 
-               pageset = zone->pageset + cpu;
+               pageset = zone_pcp(zone, cpu);
 
                for (temperature = 0; temperature < 2; temperature++)
-                       printk("cpu %d %s: low %d, high %d, batch %d\n",
+                       printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
                                cpu,
                                temperature ? "cold" : "hot",
                                pageset->pcp[temperature].low,
                                pageset->pcp[temperature].high,
-                               pageset->pcp[temperature].batch);
+                               pageset->pcp[temperature].batch,
+                               pageset->pcp[temperature].count);
                }
        }
 
        get_page_state(&ps);
        get_zone_counts(&active, &inactive, &free);
 
-       printk("\nFree pages:     %11ukB (%ukB HighMem)\n",
+       printk("Free pages:     %11ukB (%ukB HighMem)\n",
                K(nr_free_pages()),
                K(nr_free_highpages()));
 
@@ -1587,11 +1649,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn)
 {
-       struct page *start = pfn_to_page(start_pfn);
        struct page *page;
+       unsigned long end_pfn = start_pfn + size;
+       unsigned long pfn;
 
-       for (page = start; page < (start + size); page++) {
-               set_page_zone(page, NODEZONE(nid, zone));
+       for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+               if (!early_pfn_valid(pfn))
+                       continue;
+               if (!early_pfn_in_nid(pfn, nid))
+                       continue;
+               page = pfn_to_page(pfn);
+               set_page_links(page, zone, nid, pfn);
                set_page_count(page, 0);
                reset_page_mapcount(page);
                SetPageReserved(page);
@@ -1615,11 +1683,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
        }
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)    ((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+               unsigned long size)
+{
+       unsigned long snum = pfn_to_section_nr(pfn);
+       unsigned long end = pfn_to_section_nr(pfn + size);
+
+       if (FLAGS_HAS_NODE)
+               zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+       else
+               for (; snum <= end; snum++)
+                       zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
        memmap_init_zone((size), (nid), (zone), (start_pfn))
 #endif
 
+static int __devinit zone_batchsize(struct zone *zone)
+{
+       int batch;
+
+       /*
+        * The per-cpu-pages pools are set to around 1000th of the
+        * size of the zone.  But no more than 1/4 of a meg - there's
+        * no point in going beyond the size of L2 cache.
+        *
+        * OK, so we don't know how big the cache is.  So guess.
+        */
+       batch = zone->present_pages / 1024;
+       if (batch * PAGE_SIZE > 256 * 1024)
+               batch = (256 * 1024) / PAGE_SIZE;
+       batch /= 4;             /* We effectively *= 4 below */
+       if (batch < 1)
+               batch = 1;
+
+       /*
+        * Clamp the batch to a 2^n - 1 value. Having a power
+        * of 2 value was found to be more likely to have
+        * suboptimal cache aliasing properties in some cases.
+        *
+        * For example if 2 tasks are alternately allocating
+        * batches of pages, one task can end up with a lot
+        * of pages of one half of the possible page colors
+        * and the other with pages of the other colors.
+        */
+       batch = (1 << fls(batch + batch/2)) - 1;
+       return batch;
+}
+
+inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+{
+       struct per_cpu_pages *pcp;
+
+       pcp = &p->pcp[0];               /* hot */
+       pcp->count = 0;
+       pcp->low = 2 * batch;
+       pcp->high = 6 * batch;
+       pcp->batch = max(1UL, 1 * batch);
+       INIT_LIST_HEAD(&pcp->list);
+
+       pcp = &p->pcp[1];               /* cold */
+       pcp->count = 0;
+       pcp->low = 0;
+       pcp->high = 2 * batch;
+       pcp->batch = max(1UL, 1 * batch);
+       INIT_LIST_HEAD(&pcp->list);
+}
+
+#ifdef CONFIG_NUMA
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * Some NUMA counter updates may also be caught by the boot pagesets.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static struct per_cpu_pageset
+       boot_pageset[NR_CPUS];
+
+/*
+ * Dynamically allocate memory for the
+ * per cpu pageset array in struct zone.
+ */
+static int __devinit process_zones(int cpu)
+{
+       struct zone *zone, *dzone;
+
+       for_each_zone(zone) {
+
+               zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+                                        GFP_KERNEL, cpu_to_node(cpu));
+               if (!zone->pageset[cpu])
+                       goto bad;
+
+               setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+       }
+
+       return 0;
+bad:
+       for_each_zone(dzone) {
+               if (dzone == zone)
+                       break;
+               kfree(dzone->pageset[cpu]);
+               dzone->pageset[cpu] = NULL;
+       }
+       return -ENOMEM;
+}
+
+static inline void free_zone_pagesets(int cpu)
+{
+#ifdef CONFIG_NUMA
+       struct zone *zone;
+
+       for_each_zone(zone) {
+               struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
+
+               zone_pcp(zone, cpu) = NULL;
+               kfree(pset);
+       }
+#endif
+}
+
+static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
+               unsigned long action,
+               void *hcpu)
+{
+       int cpu = (long)hcpu;
+       int ret = NOTIFY_OK;
+
+       switch (action) {
+       case CPU_UP_PREPARE:
+               if (process_zones(cpu))
+                       ret = NOTIFY_BAD;
+               break;
+#ifdef CONFIG_HOTPLUG_CPU
+       case CPU_DEAD:
+               free_zone_pagesets(cpu);
+               break;
+#endif
+       default:
+               break;
+       }
+       return ret;
+}
+
+static struct notifier_block pageset_notifier =
+       { &pageset_cpuup_callback, NULL, 0 };
+
+void __init setup_per_cpu_pageset()
+{
+       int err;
+
+       /* Initialize per_cpu_pageset for cpu 0.
+        * A cpuup callback will do this for every cpu
+        * as it comes online
+        */
+       err = process_zones(smp_processor_id());
+       BUG_ON(err);
+       register_cpu_notifier(&pageset_notifier);
+}
+
+#endif
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
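To make the sizing comments in zone_batchsize() and the thresholds in setup_pageset() above concrete, here is an illustrative user-space sketch of the same arithmetic. It is not part of the patch: it assumes 4 KiB pages and an example 1 GiB zone, and substitutes a portable helper for the kernel-internal fls().

#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL            /* assumed page size for the example */

/* Portable stand-in for the kernel's fls(): position of the highest set bit. */
static int toy_fls(unsigned long x)
{
        int bit = 0;

        while (x) {
                bit++;
                x >>= 1;
        }
        return bit;
}

/* Mirrors the zone_batchsize() arithmetic from the hunk above. */
static unsigned long toy_batchsize(unsigned long present_pages)
{
        unsigned long batch = present_pages / 1024;     /* ~1/1000th of the zone */

        if (batch * TOY_PAGE_SIZE > 256 * 1024)         /* cap at 256 KiB worth */
                batch = (256 * 1024) / TOY_PAGE_SIZE;
        batch /= 4;                                     /* effectively *= 4 below */
        if (batch < 1)
                batch = 1;
        /* clamp to 2^n - 1 to avoid power-of-two cache aliasing */
        return (1UL << toy_fls(batch + batch / 2)) - 1;
}

int main(void)
{
        /* Example: a 1 GiB zone of 4 KiB pages = 262144 pages. */
        unsigned long batch = toy_batchsize(262144);

        /* setup_pageset() then derives the hot/cold list thresholds: */
        printf("batch = %lu\n", batch);
        printf("hot:  low = %lu, high = %lu\n", 2 * batch, 6 * batch);
        printf("cold: low = %lu, high = %lu\n", 0UL, 2 * batch);
        return 0;
}

For this example zone the batch works out to 31 pages, so the hot per-cpu list refills below 62 pages and drains back to the buddy lists above 186, while the cold list is capped at 62.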
@@ -1643,7 +1881,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                unsigned long size, realsize;
                unsigned long batch;
 
-               zone_table[NODEZONE(nid, j)] = zone;
                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];
@@ -1662,48 +1899,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
                zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
 
-               /*
-                * The per-cpu-pages pools are set to around 1000th of the
-                * size of the zone.  But no more than 1/4 of a meg - there's
-                * no point in going beyond the size of L2 cache.
-                *
-                * OK, so we don't know how big the cache is.  So guess.
-                */
-               batch = zone->present_pages / 1024;
-               if (batch * PAGE_SIZE > 256 * 1024)
-                       batch = (256 * 1024) / PAGE_SIZE;
-               batch /= 4;             /* We effectively *= 4 below */
-               if (batch < 1)
-                       batch = 1;
-
-               /*
-                * Clamp the batch to a 2^n - 1 value. Having a power
-                * of 2 value was found to be more likely to have
-                * suboptimal cache aliasing properties in some cases.
-                *
-                * For example if 2 tasks are alternately allocating
-                * batches of pages, one task can end up with a lot
-                * of pages of one half of the possible page colors
-                * and the other with pages of the other colors.
-                */
-               batch = (1 << fls(batch + batch/2)) - 1;
+               batch = zone_batchsize(zone);
 
                for (cpu = 0; cpu < NR_CPUS; cpu++) {
-                       struct per_cpu_pages *pcp;
-
-                       pcp = &zone->pageset[cpu].pcp[0];       /* hot */
-                       pcp->count = 0;
-                       pcp->low = 2 * batch;
-                       pcp->high = 6 * batch;
-                       pcp->batch = 1 * batch;
-                       INIT_LIST_HEAD(&pcp->list);
-
-                       pcp = &zone->pageset[cpu].pcp[1];       /* cold */
-                       pcp->count = 0;
-                       pcp->low = 0;
-                       pcp->high = 2 * batch;
-                       pcp->batch = 1 * batch;
-                       INIT_LIST_HEAD(&pcp->list);
+#ifdef CONFIG_NUMA
+                       /* Early boot. Slab allocator not functional yet */
+                       zone->pageset[cpu] = &boot_pageset[cpu];
+                       setup_pageset(&boot_pageset[cpu],0);
+#else
+                       setup_pageset(zone_pcp(zone,cpu), batch);
+#endif
                }
                printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
                                zone_names[j], realsize, batch);
@@ -1713,6 +1918,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
                zone->nr_scan_inactive = 0;
                zone->nr_active = 0;
                zone->nr_inactive = 0;
+               atomic_set(&zone->reclaim_in_progress, -1);
                if (!size)
                        continue;
 
@@ -1740,6 +1946,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
                memmap_init(size, nid, j, zone_start_pfn);
 
+               zonetable_add(zone, nid, j, zone_start_pfn, size);
+
                zone_start_pfn += size;
 
                zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1748,24 +1956,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-       unsigned long size;
-
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
 
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
+               unsigned long size;
+               struct page *map;
+
                size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-               pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+               map = alloc_remap(pgdat->node_id, size);
+               if (!map)
+                       map = alloc_bootmem_node(pgdat, size);
+               pgdat->node_mem_map = map;
        }
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
        /*
         * With no DISCONTIG, the global mem_map is just set as node 0's
         */
        if (pgdat == NODE_DATA(0))
                mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
@@ -1781,18 +1995,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
        free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 
 EXPORT_SYMBOL(contig_page_data);
+#endif
 
 void __init free_area_init(unsigned long *zones_size)
 {
-       free_area_init_node(0, &contig_page_data, zones_size,
+       free_area_init_node(0, NODE_DATA(0), zones_size,
                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
-#endif
 
 #ifdef CONFIG_PROC_FS
 
@@ -1853,6 +2067,115 @@ struct seq_operations fragmentation_op = {
        .show   = frag_show,
 };
 
+/*
+ * Output information about zones in @pgdat.
+ */
+static int zoneinfo_show(struct seq_file *m, void *arg)
+{
+       pg_data_t *pgdat = arg;
+       struct zone *zone;
+       struct zone *node_zones = pgdat->node_zones;
+       unsigned long flags;
+
+       for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
+               int i;
+
+               if (!zone->present_pages)
+                       continue;
+
+               spin_lock_irqsave(&zone->lock, flags);
+               seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+               seq_printf(m,
+                          "\n  pages free     %lu"
+                          "\n        min      %lu"
+                          "\n        low      %lu"
+                          "\n        high     %lu"
+                          "\n        active   %lu"
+                          "\n        inactive %lu"
+                          "\n        scanned  %lu (a: %lu i: %lu)"
+                          "\n        spanned  %lu"
+                          "\n        present  %lu",
+                          zone->free_pages,
+                          zone->pages_min,
+                          zone->pages_low,
+                          zone->pages_high,
+                          zone->nr_active,
+                          zone->nr_inactive,
+                          zone->pages_scanned,
+                          zone->nr_scan_active, zone->nr_scan_inactive,
+                          zone->spanned_pages,
+                          zone->present_pages);
+               seq_printf(m,
+                          "\n        protection: (%lu",
+                          zone->lowmem_reserve[0]);
+               for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
+                       seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+               seq_printf(m,
+                          ")"
+                          "\n  pagesets");
+               for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+                       struct per_cpu_pageset *pageset;
+                       int j;
+
+                       pageset = zone_pcp(zone, i);
+                       for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+                               if (pageset->pcp[j].count)
+                                       break;
+                       }
+                       if (j == ARRAY_SIZE(pageset->pcp))
+                               continue;
+                       for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
+                               seq_printf(m,
+                                          "\n    cpu: %i pcp: %i"
+                                          "\n              count: %i"
+                                          "\n              low:   %i"
+                                          "\n              high:  %i"
+                                          "\n              batch: %i",
+                                          i, j,
+                                          pageset->pcp[j].count,
+                                          pageset->pcp[j].low,
+                                          pageset->pcp[j].high,
+                                          pageset->pcp[j].batch);
+                       }
+#ifdef CONFIG_NUMA
+                       seq_printf(m,
+                                  "\n            numa_hit:       %lu"
+                                  "\n            numa_miss:      %lu"
+                                  "\n            numa_foreign:   %lu"
+                                  "\n            interleave_hit: %lu"
+                                  "\n            local_node:     %lu"
+                                  "\n            other_node:     %lu",
+                                  pageset->numa_hit,
+                                  pageset->numa_miss,
+                                  pageset->numa_foreign,
+                                  pageset->interleave_hit,
+                                  pageset->local_node,
+                                  pageset->other_node);
+#endif
+               }
+               seq_printf(m,
+                          "\n  all_unreclaimable: %u"
+                          "\n  prev_priority:     %i"
+                          "\n  temp_priority:     %i"
+                          "\n  start_pfn:         %lu",
+                          zone->all_unreclaimable,
+                          zone->prev_priority,
+                          zone->temp_priority,
+                          zone->zone_start_pfn);
+               spin_unlock_irqrestore(&zone->lock, flags);
+               seq_putc(m, '\n');
+       }
+       return 0;
+}
+
+struct seq_operations zoneinfo_op = {
+       .start  = frag_start, /* iterate over all zones. The same as in
+                              * fragmentation. */
+       .next   = frag_next,
+       .stop   = frag_stop,
+       .show   = zoneinfo_show,
+};
+
 static char *vmstat_text[] = {
        "nr_dirty",
        "nr_writeback",
@@ -2058,10 +2381,10 @@ static void setup_per_zone_pages_min(void)
                        min_pages = 128;
                        zone->pages_min = min_pages;
                } else {
-                       /* if it's a lowmem zone, reserve a number of pages 
-                        * proportionate to the zone's size. 
-                        */ 
-                       zone->pages_min = (pages_min * zone->present_pages) / 
+                       /* if it's a lowmem zone, reserve a number of pages
+                        * proportionate to the zone's size.
+                        */
+                       zone->pages_min = (pages_min * zone->present_pages) /
                                        lowmem_pages;
                }
 
