diff options
author | Jens Axboe <jaxboe@fusionio.com> | 2010-06-01 06:42:12 -0400 |
---|---|---|
committer | Jens Axboe <jaxboe@fusionio.com> | 2010-06-01 06:42:12 -0400 |
commit | b4ca761577535b2b4d153689ee97342797dfff05 (patch) | |
tree | 29054d55508f1faa22ec32acf7c245751af03348 /mm/page_alloc.c | |
parent | 28f4197e5d4707311febeec8a0eb97cb5fd93c97 (diff) | |
parent | 67a3e12b05e055c0415c556a315a3d3eb637e29e (diff) |
Merge branch 'master' into for-linus
Conflicts:
fs/pipe.c
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 317 |
1 files changed, 268 insertions, 49 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6326c71b663..431214b941ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -49,6 +49,7 @@ | |||
49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | 51 | #include <linux/memory.h> |
52 | #include <linux/compaction.h> | ||
52 | #include <trace/events/kmem.h> | 53 | #include <trace/events/kmem.h> |
53 | #include <linux/ftrace_event.h> | 54 | #include <linux/ftrace_event.h> |
54 | 55 | ||
@@ -56,6 +57,22 @@ | |||
56 | #include <asm/div64.h> | 57 | #include <asm/div64.h> |
57 | #include "internal.h" | 58 | #include "internal.h" |
58 | 59 | ||
60 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | ||
61 | DEFINE_PER_CPU(int, numa_node); | ||
62 | EXPORT_PER_CPU_SYMBOL(numa_node); | ||
63 | #endif | ||
64 | |||
65 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
66 | /* | ||
67 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | ||
68 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. | ||
69 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() | ||
70 | * defined in <linux/topology.h>. | ||
71 | */ | ||
72 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | ||
73 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | ||
74 | #endif | ||
75 | |||
59 | /* | 76 | /* |
60 | * Array of node states. | 77 | * Array of node states. |
61 | */ | 78 | */ |
@@ -475,6 +492,8 @@ static inline void __free_one_page(struct page *page, | |||
475 | int migratetype) | 492 | int migratetype) |
476 | { | 493 | { |
477 | unsigned long page_idx; | 494 | unsigned long page_idx; |
495 | unsigned long combined_idx; | ||
496 | struct page *buddy; | ||
478 | 497 | ||
479 | if (unlikely(PageCompound(page))) | 498 | if (unlikely(PageCompound(page))) |
480 | if (unlikely(destroy_compound_page(page, order))) | 499 | if (unlikely(destroy_compound_page(page, order))) |
@@ -488,9 +507,6 @@ static inline void __free_one_page(struct page *page, | |||
488 | VM_BUG_ON(bad_range(zone, page)); | 507 | VM_BUG_ON(bad_range(zone, page)); |
489 | 508 | ||
490 | while (order < MAX_ORDER-1) { | 509 | while (order < MAX_ORDER-1) { |
491 | unsigned long combined_idx; | ||
492 | struct page *buddy; | ||
493 | |||
494 | buddy = __page_find_buddy(page, page_idx, order); | 510 | buddy = __page_find_buddy(page, page_idx, order); |
495 | if (!page_is_buddy(page, buddy, order)) | 511 | if (!page_is_buddy(page, buddy, order)) |
496 | break; | 512 | break; |
@@ -505,8 +521,29 @@ static inline void __free_one_page(struct page *page, | |||
505 | order++; | 521 | order++; |
506 | } | 522 | } |
507 | set_page_order(page, order); | 523 | set_page_order(page, order); |
508 | list_add(&page->lru, | 524 | |
509 | &zone->free_area[order].free_list[migratetype]); | 525 | /* |
526 | * If this is not the largest possible page, check if the buddy | ||
527 | * of the next-highest order is free. If it is, it's possible | ||
528 | * that pages are being freed that will coalesce soon. In case | ||
529 | * that is happening, add the free page to the tail of the list | ||
530 | * so it's less likely to be used soon and more likely to be merged | ||
531 | * as a higher order page. | ||
532 | */ | ||
533 | if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { | ||
534 | struct page *higher_page, *higher_buddy; | ||
535 | combined_idx = __find_combined_index(page_idx, order); | ||
536 | higher_page = page + combined_idx - page_idx; | ||
537 | higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); | ||
538 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | ||
539 | list_add_tail(&page->lru, | ||
540 | &zone->free_area[order].free_list[migratetype]); | ||
541 | goto out; | ||
542 | } | ||
543 | } | ||
544 | |||
545 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); | ||
546 | out: | ||
510 | zone->free_area[order].nr_free++; | 547 | zone->free_area[order].nr_free++; |
511 | } | 548 | } |
512 | 549 | ||
@@ -599,20 +636,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
599 | spin_unlock(&zone->lock); | 636 | spin_unlock(&zone->lock); |
600 | } | 637 | } |
601 | 638 | ||
602 | static void __free_pages_ok(struct page *page, unsigned int order) | 639 | static bool free_pages_prepare(struct page *page, unsigned int order) |
603 | { | 640 | { |
604 | unsigned long flags; | ||
605 | int i; | 641 | int i; |
606 | int bad = 0; | 642 | int bad = 0; |
607 | int wasMlocked = __TestClearPageMlocked(page); | ||
608 | 643 | ||
609 | trace_mm_page_free_direct(page, order); | 644 | trace_mm_page_free_direct(page, order); |
610 | kmemcheck_free_shadow(page, order); | 645 | kmemcheck_free_shadow(page, order); |
611 | 646 | ||
612 | for (i = 0 ; i < (1 << order) ; ++i) | 647 | for (i = 0; i < (1 << order); i++) { |
613 | bad += free_pages_check(page + i); | 648 | struct page *pg = page + i; |
649 | |||
650 | if (PageAnon(pg)) | ||
651 | pg->mapping = NULL; | ||
652 | bad += free_pages_check(pg); | ||
653 | } | ||
614 | if (bad) | 654 | if (bad) |
615 | return; | 655 | return false; |
616 | 656 | ||
617 | if (!PageHighMem(page)) { | 657 | if (!PageHighMem(page)) { |
618 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 658 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
@@ -622,6 +662,17 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
622 | arch_free_page(page, order); | 662 | arch_free_page(page, order); |
623 | kernel_map_pages(page, 1 << order, 0); | 663 | kernel_map_pages(page, 1 << order, 0); |
624 | 664 | ||
665 | return true; | ||
666 | } | ||
667 | |||
668 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
669 | { | ||
670 | unsigned long flags; | ||
671 | int wasMlocked = __TestClearPageMlocked(page); | ||
672 | |||
673 | if (!free_pages_prepare(page, order)) | ||
674 | return; | ||
675 | |||
625 | local_irq_save(flags); | 676 | local_irq_save(flags); |
626 | if (unlikely(wasMlocked)) | 677 | if (unlikely(wasMlocked)) |
627 | free_page_mlock(page); | 678 | free_page_mlock(page); |
@@ -1107,21 +1158,9 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1107 | int migratetype; | 1158 | int migratetype; |
1108 | int wasMlocked = __TestClearPageMlocked(page); | 1159 | int wasMlocked = __TestClearPageMlocked(page); |
1109 | 1160 | ||
1110 | trace_mm_page_free_direct(page, 0); | 1161 | if (!free_pages_prepare(page, 0)) |
1111 | kmemcheck_free_shadow(page, 0); | ||
1112 | |||
1113 | if (PageAnon(page)) | ||
1114 | page->mapping = NULL; | ||
1115 | if (free_pages_check(page)) | ||
1116 | return; | 1162 | return; |
1117 | 1163 | ||
1118 | if (!PageHighMem(page)) { | ||
1119 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | ||
1120 | debug_check_no_obj_freed(page_address(page), PAGE_SIZE); | ||
1121 | } | ||
1122 | arch_free_page(page, 0); | ||
1123 | kernel_map_pages(page, 1, 0); | ||
1124 | |||
1125 | migratetype = get_pageblock_migratetype(page); | 1164 | migratetype = get_pageblock_migratetype(page); |
1126 | set_page_private(page, migratetype); | 1165 | set_page_private(page, migratetype); |
1127 | local_irq_save(flags); | 1166 | local_irq_save(flags); |
@@ -1188,6 +1227,51 @@ void split_page(struct page *page, unsigned int order) | |||
1188 | } | 1227 | } |
1189 | 1228 | ||
1190 | /* | 1229 | /* |
1230 | * Similar to split_page except the page is already free. As this is only | ||
1231 | * being used for migration, the migratetype of the block also changes. | ||
1232 | * As this is called with interrupts disabled, the caller is responsible | ||
1233 | * for calling arch_alloc_page() and kernel_map_pages() after interrupts | ||
1234 | * are enabled. | ||
1235 | * | ||
1236 | * Note: this is probably too low level an operation for use in drivers. | ||
1237 | * Please consult with lkml before using this in your driver. | ||
1238 | */ | ||
1239 | int split_free_page(struct page *page) | ||
1240 | { | ||
1241 | unsigned int order; | ||
1242 | unsigned long watermark; | ||
1243 | struct zone *zone; | ||
1244 | |||
1245 | BUG_ON(!PageBuddy(page)); | ||
1246 | |||
1247 | zone = page_zone(page); | ||
1248 | order = page_order(page); | ||
1249 | |||
1250 | /* Obey watermarks as if the page was being allocated */ | ||
1251 | watermark = low_wmark_pages(zone) + (1 << order); | ||
1252 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | ||
1253 | return 0; | ||
1254 | |||
1255 | /* Remove page from free list */ | ||
1256 | list_del(&page->lru); | ||
1257 | zone->free_area[order].nr_free--; | ||
1258 | rmv_page_order(page); | ||
1259 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
1260 | |||
1261 | /* Split into individual pages */ | ||
1262 | set_page_refcounted(page); | ||
1263 | split_page(page, order); | ||
1264 | |||
1265 | if (order >= pageblock_order - 1) { | ||
1266 | struct page *endpage = page + (1 << order) - 1; | ||
1267 | for (; page < endpage; page += pageblock_nr_pages) | ||
1268 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
1269 | } | ||
1270 | |||
1271 | return 1 << order; | ||
1272 | } | ||
1273 | |||
1274 | /* | ||
1191 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1275 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
1192 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1276 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1193 | * or two. | 1277 | * or two. |
@@ -1693,6 +1777,62 @@ out: | |||
1693 | return page; | 1777 | return page; |
1694 | } | 1778 | } |
1695 | 1779 | ||
1780 | #ifdef CONFIG_COMPACTION | ||
1781 | /* Try memory compaction for high-order allocations before reclaim */ | ||
1782 | static struct page * | ||
1783 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | ||
1784 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1785 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
1786 | int migratetype, unsigned long *did_some_progress) | ||
1787 | { | ||
1788 | struct page *page; | ||
1789 | |||
1790 | if (!order || compaction_deferred(preferred_zone)) | ||
1791 | return NULL; | ||
1792 | |||
1793 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | ||
1794 | nodemask); | ||
1795 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
1796 | |||
1797 | /* Page migration frees to the PCP lists but we want merging */ | ||
1798 | drain_pages(get_cpu()); | ||
1799 | put_cpu(); | ||
1800 | |||
1801 | page = get_page_from_freelist(gfp_mask, nodemask, | ||
1802 | order, zonelist, high_zoneidx, | ||
1803 | alloc_flags, preferred_zone, | ||
1804 | migratetype); | ||
1805 | if (page) { | ||
1806 | preferred_zone->compact_considered = 0; | ||
1807 | preferred_zone->compact_defer_shift = 0; | ||
1808 | count_vm_event(COMPACTSUCCESS); | ||
1809 | return page; | ||
1810 | } | ||
1811 | |||
1812 | /* | ||
1813 | * It's bad if compaction run occurs and fails. | ||
1814 | * The most likely reason is that pages exist, | ||
1815 | * but not enough to satisfy watermarks. | ||
1816 | */ | ||
1817 | count_vm_event(COMPACTFAIL); | ||
1818 | defer_compaction(preferred_zone); | ||
1819 | |||
1820 | cond_resched(); | ||
1821 | } | ||
1822 | |||
1823 | return NULL; | ||
1824 | } | ||
1825 | #else | ||
1826 | static inline struct page * | ||
1827 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | ||
1828 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1829 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
1830 | int migratetype, unsigned long *did_some_progress) | ||
1831 | { | ||
1832 | return NULL; | ||
1833 | } | ||
1834 | #endif /* CONFIG_COMPACTION */ | ||
1835 | |||
1696 | /* The really slow allocator path where we enter direct reclaim */ | 1836 | /* The really slow allocator path where we enter direct reclaim */ |
1697 | static inline struct page * | 1837 | static inline struct page * |
1698 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 1838 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
@@ -1879,6 +2019,15 @@ rebalance: | |||
1879 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2019 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
1880 | goto nopage; | 2020 | goto nopage; |
1881 | 2021 | ||
2022 | /* Try direct compaction */ | ||
2023 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2024 | zonelist, high_zoneidx, | ||
2025 | nodemask, | ||
2026 | alloc_flags, preferred_zone, | ||
2027 | migratetype, &did_some_progress); | ||
2028 | if (page) | ||
2029 | goto got_pg; | ||
2030 | |||
1882 | /* Try direct reclaim and then allocating */ | 2031 | /* Try direct reclaim and then allocating */ |
1883 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2032 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
1884 | zonelist, high_zoneidx, | 2033 | zonelist, high_zoneidx, |
@@ -1970,10 +2119,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
1970 | if (unlikely(!zonelist->_zonerefs->zone)) | 2119 | if (unlikely(!zonelist->_zonerefs->zone)) |
1971 | return NULL; | 2120 | return NULL; |
1972 | 2121 | ||
2122 | get_mems_allowed(); | ||
1973 | /* The preferred zone is used for statistics later */ | 2123 | /* The preferred zone is used for statistics later */ |
1974 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | 2124 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); |
1975 | if (!preferred_zone) | 2125 | if (!preferred_zone) { |
2126 | put_mems_allowed(); | ||
1976 | return NULL; | 2127 | return NULL; |
2128 | } | ||
1977 | 2129 | ||
1978 | /* First allocation attempt */ | 2130 | /* First allocation attempt */ |
1979 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2131 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -1983,6 +2135,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
1983 | page = __alloc_pages_slowpath(gfp_mask, order, | 2135 | page = __alloc_pages_slowpath(gfp_mask, order, |
1984 | zonelist, high_zoneidx, nodemask, | 2136 | zonelist, high_zoneidx, nodemask, |
1985 | preferred_zone, migratetype); | 2137 | preferred_zone, migratetype); |
2138 | put_mems_allowed(); | ||
1986 | 2139 | ||
1987 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2140 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
1988 | return page; | 2141 | return page; |
@@ -2434,8 +2587,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2434 | strncpy((char*)table->data, saved_string, | 2587 | strncpy((char*)table->data, saved_string, |
2435 | NUMA_ZONELIST_ORDER_LEN); | 2588 | NUMA_ZONELIST_ORDER_LEN); |
2436 | user_zonelist_order = oldval; | 2589 | user_zonelist_order = oldval; |
2437 | } else if (oldval != user_zonelist_order) | 2590 | } else if (oldval != user_zonelist_order) { |
2438 | build_all_zonelists(); | 2591 | mutex_lock(&zonelists_mutex); |
2592 | build_all_zonelists(NULL); | ||
2593 | mutex_unlock(&zonelists_mutex); | ||
2594 | } | ||
2439 | } | 2595 | } |
2440 | out: | 2596 | out: |
2441 | mutex_unlock(&zl_order_mutex); | 2597 | mutex_unlock(&zl_order_mutex); |
@@ -2582,7 +2738,7 @@ static int default_zonelist_order(void) | |||
2582 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. | 2738 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. |
2583 | * If they are really small and used heavily, the system can fall | 2739 | * If they are really small and used heavily, the system can fall |
2584 | * into OOM very easily. | 2740 | * into OOM very easily. |
2585 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | 2741 | * This function detect ZONE_DMA/DMA32 size and configures zone order. |
2586 | */ | 2742 | */ |
2587 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | 2743 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ |
2588 | low_kmem_size = 0; | 2744 | low_kmem_size = 0; |
@@ -2594,6 +2750,15 @@ static int default_zonelist_order(void) | |||
2594 | if (zone_type < ZONE_NORMAL) | 2750 | if (zone_type < ZONE_NORMAL) |
2595 | low_kmem_size += z->present_pages; | 2751 | low_kmem_size += z->present_pages; |
2596 | total_size += z->present_pages; | 2752 | total_size += z->present_pages; |
2753 | } else if (zone_type == ZONE_NORMAL) { | ||
2754 | /* | ||
2755 | * If any node has only lowmem, then node order | ||
2756 | * is preferred to allow kernel allocations | ||
2757 | * locally; otherwise, they can easily infringe | ||
2758 | * on other nodes when there is an abundance of | ||
2759 | * lowmem available to allocate from. | ||
2760 | */ | ||
2761 | return ZONELIST_ORDER_NODE; | ||
2597 | } | 2762 | } |
2598 | } | 2763 | } |
2599 | } | 2764 | } |
@@ -2707,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2707 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 2872 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
2708 | } | 2873 | } |
2709 | 2874 | ||
2875 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
2876 | /* | ||
2877 | * Return node id of node used for "local" allocations. | ||
2878 | * I.e., first node id of first zone in arg node's generic zonelist. | ||
2879 | * Used for initializing percpu 'numa_mem', which is used primarily | ||
2880 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. | ||
2881 | */ | ||
2882 | int local_memory_node(int node) | ||
2883 | { | ||
2884 | struct zone *zone; | ||
2885 | |||
2886 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), | ||
2887 | gfp_zone(GFP_KERNEL), | ||
2888 | NULL, | ||
2889 | &zone); | ||
2890 | return zone->node; | ||
2891 | } | ||
2892 | #endif | ||
2710 | 2893 | ||
2711 | #else /* CONFIG_NUMA */ | 2894 | #else /* CONFIG_NUMA */ |
2712 | 2895 | ||
@@ -2776,9 +2959,16 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2776 | */ | 2959 | */ |
2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 2960 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 2961 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
2962 | static void setup_zone_pageset(struct zone *zone); | ||
2963 | |||
2964 | /* | ||
2965 | * Global mutex to protect against size modification of zonelists | ||
2966 | * as well as to serialize pageset setup for the new populated zone. | ||
2967 | */ | ||
2968 | DEFINE_MUTEX(zonelists_mutex); | ||
2779 | 2969 | ||
2780 | /* return values int ....just for stop_machine() */ | 2970 | /* return values int ....just for stop_machine() */ |
2781 | static int __build_all_zonelists(void *dummy) | 2971 | static __init_refok int __build_all_zonelists(void *data) |
2782 | { | 2972 | { |
2783 | int nid; | 2973 | int nid; |
2784 | int cpu; | 2974 | int cpu; |
@@ -2793,6 +2983,14 @@ static int __build_all_zonelists(void *dummy) | |||
2793 | build_zonelist_cache(pgdat); | 2983 | build_zonelist_cache(pgdat); |
2794 | } | 2984 | } |
2795 | 2985 | ||
2986 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
2987 | /* Setup real pagesets for the new zone */ | ||
2988 | if (data) { | ||
2989 | struct zone *zone = data; | ||
2990 | setup_zone_pageset(zone); | ||
2991 | } | ||
2992 | #endif | ||
2993 | |||
2796 | /* | 2994 | /* |
2797 | * Initialize the boot_pagesets that are going to be used | 2995 | * Initialize the boot_pagesets that are going to be used |
2798 | * for bootstrapping processors. The real pagesets for | 2996 | * for bootstrapping processors. The real pagesets for |
@@ -2806,13 +3004,31 @@ static int __build_all_zonelists(void *dummy) | |||
2806 | * needs the percpu allocator in order to allocate its pagesets | 3004 | * needs the percpu allocator in order to allocate its pagesets |
2807 | * (a chicken-egg dilemma). | 3005 | * (a chicken-egg dilemma). |
2808 | */ | 3006 | */ |
2809 | for_each_possible_cpu(cpu) | 3007 | for_each_possible_cpu(cpu) { |
2810 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | 3008 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); |
2811 | 3009 | ||
3010 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
3011 | /* | ||
3012 | * We now know the "local memory node" for each node-- | ||
3013 | * i.e., the node of the first zone in the generic zonelist. | ||
3014 | * Set up numa_mem percpu variable for on-line cpus. During | ||
3015 | * boot, only the boot cpu should be on-line; we'll init the | ||
3016 | * secondary cpus' numa_mem as they come on-line. During | ||
3017 | * node/memory hotplug, we'll fixup all on-line cpus. | ||
3018 | */ | ||
3019 | if (cpu_online(cpu)) | ||
3020 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | ||
3021 | #endif | ||
3022 | } | ||
3023 | |||
2812 | return 0; | 3024 | return 0; |
2813 | } | 3025 | } |
2814 | 3026 | ||
2815 | void build_all_zonelists(void) | 3027 | /* |
3028 | * Called with zonelists_mutex held always | ||
3029 | * unless system_state == SYSTEM_BOOTING. | ||
3030 | */ | ||
3031 | void build_all_zonelists(void *data) | ||
2816 | { | 3032 | { |
2817 | set_zonelist_order(); | 3033 | set_zonelist_order(); |
2818 | 3034 | ||
@@ -2823,7 +3039,7 @@ void build_all_zonelists(void) | |||
2823 | } else { | 3039 | } else { |
2824 | /* we have to stop all cpus to guarantee there is no user | 3040 | /* we have to stop all cpus to guarantee there is no user |
2825 | of zonelist */ | 3041 | of zonelist */ |
2826 | stop_machine(__build_all_zonelists, NULL, NULL); | 3042 | stop_machine(__build_all_zonelists, data, NULL); |
2827 | /* cpuset refresh routine should be here */ | 3043 | /* cpuset refresh routine should be here */ |
2828 | } | 3044 | } |
2829 | vm_total_pages = nr_free_pagecache_pages(); | 3045 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3146,31 +3362,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3146 | pcp->batch = PAGE_SHIFT * 8; | 3362 | pcp->batch = PAGE_SHIFT * 8; |
3147 | } | 3363 | } |
3148 | 3364 | ||
3365 | static __meminit void setup_zone_pageset(struct zone *zone) | ||
3366 | { | ||
3367 | int cpu; | ||
3368 | |||
3369 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | ||
3370 | |||
3371 | for_each_possible_cpu(cpu) { | ||
3372 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
3373 | |||
3374 | setup_pageset(pcp, zone_batchsize(zone)); | ||
3375 | |||
3376 | if (percpu_pagelist_fraction) | ||
3377 | setup_pagelist_highmark(pcp, | ||
3378 | (zone->present_pages / | ||
3379 | percpu_pagelist_fraction)); | ||
3380 | } | ||
3381 | } | ||
3382 | |||
3149 | /* | 3383 | /* |
3150 | * Allocate per cpu pagesets and initialize them. | 3384 | * Allocate per cpu pagesets and initialize them. |
3151 | * Before this call only boot pagesets were available. | 3385 | * Before this call only boot pagesets were available. |
3152 | * Boot pagesets will no longer be used by this processorr | ||
3153 | * after setup_per_cpu_pageset(). | ||
3154 | */ | 3386 | */ |
3155 | void __init setup_per_cpu_pageset(void) | 3387 | void __init setup_per_cpu_pageset(void) |
3156 | { | 3388 | { |
3157 | struct zone *zone; | 3389 | struct zone *zone; |
3158 | int cpu; | ||
3159 | |||
3160 | for_each_populated_zone(zone) { | ||
3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | ||
3162 | |||
3163 | for_each_possible_cpu(cpu) { | ||
3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
3165 | |||
3166 | setup_pageset(pcp, zone_batchsize(zone)); | ||
3167 | 3390 | ||
3168 | if (percpu_pagelist_fraction) | 3391 | for_each_populated_zone(zone) |
3169 | setup_pagelist_highmark(pcp, | 3392 | setup_zone_pageset(zone); |
3170 | (zone->present_pages / | ||
3171 | percpu_pagelist_fraction)); | ||
3172 | } | ||
3173 | } | ||
3174 | } | 3393 | } |
3175 | 3394 | ||
3176 | static noinline __init_refok | 3395 | static noinline __init_refok |