Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 404 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 281 insertions(+), 123 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /* work_structs for global per-cpu drains */
+struct pcpu_drain {
+	struct zone *zone;
+	struct work_struct work;
+};
 DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
 
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 };
 EXPORT_SYMBOL(node_states);
 
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-char * const migratetype_names[MIGRATE_TYPES] = {
+const char * const migratetype_names[MIGRATE_TYPES] = {
 	"Unmovable",
 	"Movable",
 	"Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
 int watermark_scale_factor = 10;
 
-static unsigned long nr_kernel_pages __meminitdata;
-static unsigned long nr_all_pages __meminitdata;
-static unsigned long dma_reserve __meminitdata;
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
 static unsigned long required_kernelcore __initdata;
 static unsigned long required_kernelcore_percent __initdata;
 static unsigned long required_movablecore __initdata;
 static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
 static bool mirrored_kernelcore __meminitdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+	if (!static_branch_unlikely(&deferred_pages))
+		kasan_free_pages(page, order);
+}
+
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	/* Always populate low zones for address-constrained allocations */
 	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
 		return false;
+
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
 	nr_initialised++;
-	if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+	if ((nr_initialised > PAGES_PER_SECTION) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		NODE_DATA(nid)->first_deferred_pfn = pfn;
 		return true;
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	return false;
 }
 #else
+#define kasan_free_nondeferred_pages(p, o)	kasan_free_pages(p, o)
+
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
 	return false;
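Note: the deferred_pages static key added above (and removed from its old location further down in this diff) is the standard jump-label pattern for a boot-time-only path: the branch stays essentially free at runtime and is patched out for good once deferred page init has finished. A minimal, self-contained sketch of the same pattern, with invented names, illustrative only and not part of this diff:

	#include <linux/jump_label.h>
	#include <linux/kasan.h>

	/* Defaults to true; flipped off once deferred page init completes. */
	static DEFINE_STATIC_KEY_TRUE(demo_deferred_init);

	static void demo_deferred_init_done(void)
	{
		/* Patches the branch so the check below becomes a plain fall-through. */
		static_branch_disable(&demo_deferred_init);
	}

	static void demo_free_hook(struct page *page, int order)
	{
		if (static_branch_unlikely(&demo_deferred_init))
			return;		/* still in deferred init: skip the expensive hook */
		kasan_free_pages(page, order);
	}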
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 	unsigned long old_word, word;
 
 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
 
 	bitmap = get_pageblock_bitmap(page, pfn);
 	bitidx = pfn_to_bitidx(page, pfn);
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	arch_free_page(page, order);
 	kernel_poison_pages(page, 1 << order, 0);
 	kernel_map_pages(page, 1 << order, 0);
-	kasan_free_pages(page, order);
+	kasan_free_nondeferred_pages(page, order);
 
 	return true;
 }
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 	init_page_count(page);
 	page_mapcount_reset(page);
 	page_cpupid_reset_last(page);
+	page_kasan_tag_reset(page);
 
 	INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
 	__ClearPageReserved(p);
 	set_page_count(p, 0);
 
-	page_zone(page)->managed_pages += nr_pages;
+	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data)
 }
 
 /*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
  * If this zone has deferred pages, try to grow it by initializing enough
  * deferred pages to satisfy the allocation specified by order, rounded up to
  * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  */
 static int fallbacks[MIGRATE_TYPES][4] = {
 	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
-	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 #ifdef CONFIG_CMA
 	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 #endif
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 	return false;
 }
 
+static inline void boost_watermark(struct zone *zone)
+{
+	unsigned long max_boost;
+
+	if (!watermark_boost_factor)
+		return;
+
+	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+			watermark_boost_factor, 10000);
+	max_boost = max(pageblock_nr_pages, max_boost);
+
+	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+		max_boost);
+}
+
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
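To put concrete (illustrative, not taken from this diff) numbers on the boost arithmetic: with the default watermark_boost_factor of 15000 added above, a zone whose WMARK_HIGH is 4096 pages gets max_boost = mult_frac(4096, 15000, 10000) = 6144 pages, i.e. at most 150% of the high watermark; each fragmentation-causing fallback then raises watermark_boost by one pageblock (512 pages with 2MB pageblocks on x86-64) until that cap is reached, which temporarily raises the effective watermarks read through wmark_pages() further down in this diff.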
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-					int start_type, bool whole_block)
+		unsigned int alloc_flags, int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
 	struct free_area *area;
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 		goto single_page;
 	}
 
+	/*
+	 * Boost watermarks to increase reclaim pressure to reduce the
+	 * likelihood of future fallbacks. Wake kswapd now as the node
+	 * may be balanced overall and kswapd will not wake naturally.
+	 */
+	boost_watermark(zone);
+	if (alloc_flags & ALLOC_KSWAPD)
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
 		goto single_page;
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
 	 * Check is race-prone but harmless.
 	 */
-	max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+	max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
 	if (zone->nr_reserved_highatomic >= max_managed)
 		return;
 
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * condition simpler.
  */
 static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+						unsigned int alloc_flags)
 {
 	struct free_area *area;
 	int current_order;
+	int min_order = order;
 	struct page *page;
 	int fallback_mt;
 	bool can_steal;
 
 	/*
+	 * Do not steal pages from freelists belonging to other pageblocks
+	 * i.e. orders < pageblock_order. If there are no local zones free,
+	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+	 */
+	if (alloc_flags & ALLOC_NOFRAGMENT)
+		min_order = pageblock_order;
+
+	/*
 	 * Find the largest available free page in the other list. This roughly
 	 * approximates finding the pageblock with the most free pages, which
 	 * would be too costly to do exactly.
 	 */
-	for (current_order = MAX_ORDER - 1; current_order >= order;
+	for (current_order = MAX_ORDER - 1; current_order >= min_order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2433,7 +2499,8 @@ do_steal:
 	page = list_first_entry(&area->free_list[fallback_mt],
 							struct page, lru);
 
-	steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+								can_steal);
 
 	trace_mm_page_alloc_extfrag(page, order, current_order,
 		start_migratetype, fallback_mt);
@@ -2447,7 +2514,8 @@ do_steal:
  * Call me with the zone->lock already held.
  */
 static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+						unsigned int alloc_flags)
 {
 	struct page *page;
 
@@ -2457,7 +2525,8 @@ retry:
 		if (migratetype == MIGRATE_MOVABLE)
 			page = __rmqueue_cma_fallback(zone, order);
 
-		if (!page && __rmqueue_fallback(zone, order, migratetype))
+		if (!page && __rmqueue_fallback(zone, order, migratetype,
+								alloc_flags))
 			goto retry;
 	}
 
@@ -2472,13 +2541,14 @@ retry:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, unsigned int alloc_flags)
 {
 	int i, alloced = 0;
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype);
+		struct page *page = __rmqueue(zone, order, migratetype,
+								alloc_flags);
 		if (unlikely(page == NULL))
 			break;
 
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone)
 
 static void drain_local_pages_wq(struct work_struct *work)
 {
+	struct pcpu_drain *drain;
+
+	drain = container_of(work, struct pcpu_drain, work);
+
 	/*
 	 * drain_all_pages doesn't use proper cpu hotplug protection so
 	 * we can race with cpu offline when the WQ can move this from
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 	 * a different one.
 	 */
 	preempt_disable();
-	drain_local_pages(NULL);
+	drain_local_pages(drain->zone);
 	preempt_enable();
 }
 
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone)
 	}
 
 	for_each_cpu(cpu, &cpus_with_pcps) {
-		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
-		INIT_WORK(work, drain_local_pages_wq);
-		queue_work_on(cpu, mm_percpu_wq, work);
+		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+		drain->zone = zone;
+		INIT_WORK(&drain->work, drain_local_pages_wq);
+		queue_work_on(cpu, mm_percpu_wq, &drain->work);
 	}
 	for_each_cpu(cpu, &cpus_with_pcps)
-		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
 
 	mutex_unlock(&pcpu_drain_mutex);
 }
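Together with the drain_local_pages_wq() change above, this is the usual recipe for handing per-invocation data to a per-CPU work item: wrap the work_struct in a small per-CPU struct, fill the extra fields before queueing, and recover the wrapper with container_of() in the handler. A condensed, self-contained sketch of the pattern (names invented for illustration, not part of this diff):

	#include <linux/percpu.h>
	#include <linux/workqueue.h>

	struct demo_drain {
		struct zone *zone;		/* data the handler needs */
		struct work_struct work;
	};
	static DEFINE_PER_CPU(struct demo_drain, demo_drain);

	static void demo_drain_fn(struct work_struct *work)
	{
		struct demo_drain *drain = container_of(work, struct demo_drain, work);

		drain_local_pages(drain->zone);	/* runs on the CPU the work was queued on */
	}

	static void demo_drain_all(struct zone *zone)
	{
		int cpu;

		for_each_online_cpu(cpu) {
			struct demo_drain *drain = per_cpu_ptr(&demo_drain, cpu);

			drain->zone = zone;
			INIT_WORK(&drain->work, demo_drain_fn);
			queue_work_on(cpu, mm_percpu_wq, &drain->work);	/* same workqueue as in the hunk above */
		}
		for_each_online_cpu(cpu)
			flush_work(&per_cpu_ptr(&demo_drain, cpu)->work);
	}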
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
 			struct list_head *list)
 {
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
-					migratetype);
+					migratetype, alloc_flags);
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
-			gfp_t gfp_flags, int migratetype)
+			gfp_t gfp_flags, int migratetype,
+			unsigned int alloc_flags)
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+	page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 
 	if (likely(order == 0)) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
-				gfp_flags, migratetype);
+				gfp_flags, migratetype, alloc_flags);
 		goto out;
 	}
 
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 			trace_mm_page_alloc_zone_locked(page, order, migratetype);
 		}
 		if (!page)
-			page = __rmqueue(zone, order, migratetype);
+			page = __rmqueue(zone, order, migratetype, alloc_flags);
 	} while (page && check_new_pages(page, order));
 	spin_unlock(&zone->lock);
 	if (!page)
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);
 
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	if (order < fail_page_alloc.min_order)
 		return false;
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs);
 
 #else /* CONFIG_FAIL_PAGE_ALLOC */
 
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	return false;
 }
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
 /*
  * Return true if free base pages are above 'mark'. For high-order checks it
  * will return true of the order-0 watermark is reached and there is at least
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 #endif	/* CONFIG_NUMA */
 
 /*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+	unsigned int alloc_flags = 0;
+
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+		alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+	if (zone_idx(zone) != ZONE_NORMAL)
+		goto out;
+
+	/*
+	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+	 * on UMA that if Normal is populated then so is DMA32.
+	 */
+	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+	if (nr_online_nodes > 1 && !populated_zone(--zone))
+		goto out;
+
+out:
+#endif /* CONFIG_ZONE_DMA32 */
+	return alloc_flags;
+}
+
+/*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
@@ -3261,14 +3379,18 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
-	struct zoneref *z = ac->preferred_zoneref;
+	struct zoneref *z;
 	struct zone *zone;
 	struct pglist_data *last_pgdat_dirty_limit = NULL;
+	bool no_fallback;
 
+retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
+	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+	z = ac->preferred_zoneref;
 	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 		struct page *page;
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			}
 		}
 
-		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+		if (no_fallback && nr_online_nodes > 1 &&
+		    zone != ac->preferred_zoneref->zone) {
+			int local_nid;
+
+			/*
+			 * If moving to a remote node, retry but allow
+			 * fragmenting fallbacks. Locality is more important
+			 * than fragmentation avoidance.
+			 */
+			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+			if (zone_to_nid(zone) != local_nid) {
+				alloc_flags &= ~ALLOC_NOFRAGMENT;
+				goto retry;
+			}
+		}
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
 		if (!zone_watermark_fast(zone, order, mark,
 				       ac_classzone_idx(ac), alloc_flags)) {
 			int ret;
@@ -3374,6 +3512,15 @@ try_this_zone:
 		}
 	}
 
+	/*
+	 * It's possible on a UMA machine to get through all zones that are
+	 * fragmented. If avoiding fragmentation, reset and try again.
+	 */
+	if (no_fallback) {
+		alloc_flags &= ~ALLOC_NOFRAGMENT;
+		goto retry;
+	}
+
 	return NULL;
 }
 
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
 			current->comm, &vaf, gfp_mask, &gfp_mask,
 			nodemask_pr_args(nodemask));
 	va_end(args);
 
 	cpuset_print_current_mems_allowed();
-
+	pr_cont("\n");
 	dump_stack();
 	warn_alloc_show_mem(gfp_mask, nodemask);
 }
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+		alloc_flags |= ALLOC_KSWAPD;
+
 #ifdef CONFIG_CMA
 	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
@@ -4092,7 +4242,7 @@ retry_cpuset:
 	if (!ac->preferred_zoneref->zone)
 		goto nopage;
 
-	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
 	/*
@@ -4150,7 +4300,7 @@ retry_cpuset:
 
 retry:
 	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
-	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
 	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 
 	finalise_ac(gfp_mask, &ac);
 
+	/*
+	 * Forbid the first pass from falling back to types that fragment
+	 * memory until all local zones are considered.
+	 */
+	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (likely(page))
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
 {
-	if (put_page_testzero(page)) {
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (order == 0)		/* Via pcp? */
+		free_unref_page(page);
+	else
+		__free_pages_ok(page, order);
 }
 
+void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page))
+		free_the_page(page, order);
+}
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
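A small usage sketch of what changes in practice (illustrative only, not part of this diff): an order-0 page freed through __free_pages() now takes the per-cpu-list path rather than going straight to the buddy free path, and the same consolidation is applied to page_frag_free() and __page_frag_cache_drain() in the hunks below.

	#include <linux/gfp.h>

	static void demo_order0_free(void)
	{
		struct page *page = alloc_pages(GFP_KERNEL, 0);

		if (page)
			__free_pages(page, 0);	/* last ref: free_the_page() -> free_unref_page() */
	}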
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
 	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
-	if (page_ref_sub_and_test(page, count)) {
-		unsigned int order = compound_order(page);
-
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (page_ref_sub_and_test(page, count))
+		free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr)
 	struct page *page = virt_to_head_page(addr);
 
 	if (unlikely(put_page_testzero(page)))
-		__free_pages_ok(page, compound_order(page));
+		free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(page_frag_free);
 
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset)
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
 	for_each_zone_zonelist(zone, z, zonelist, offset) {
-		unsigned long size = zone->managed_pages;
+		unsigned long size = zone_managed_pages(zone);
 		unsigned long high = high_wmark_pages(zone);
 		if (size > high)
 			sum += size - high;
@@ -4712,7 +4865,7 @@ long si_mem_available(void)
 		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	for_each_zone(zone)
-		wmark_low += zone->watermark[WMARK_LOW];
+		wmark_low += low_wmark_pages(zone);
 
 	/*
 	 * Estimate the amount of memory available for userspace allocations,
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
 
 void si_meminfo(struct sysinfo *val)
 {
-	val->totalram = totalram_pages;
+	val->totalram = totalram_pages();
 	val->sharedram = global_node_page_state(NR_SHMEM);
 	val->freeram = global_zone_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
-	val->totalhigh = totalhigh_pages;
+	val->totalhigh = totalhigh_pages();
 	val->freehigh = nr_free_highpages();
 	val->mem_unit = PAGE_SIZE;
 }
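totalram_pages(), totalhigh_pages() and the *_add()/*_inc() helpers used in these hunks come from the header changes that accompany this patch rather than from page_alloc.c itself; approximately:

	/* include/linux/mm.h (approximate sketch of the accompanying header change) */
	extern atomic_long_t _totalram_pages;

	static inline unsigned long totalram_pages(void)
	{
		return (unsigned long)atomic_long_read(&_totalram_pages);
	}

	static inline void totalram_pages_add(long count)
	{
		atomic_long_add(count, &_totalram_pages);
	}

	static inline void totalram_pages_inc(void)
	{
		atomic_long_inc(&_totalram_pages);
	}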
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	pg_data_t *pgdat = NODE_DATA(nid);
 
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
-		managed_pages += pgdat->node_zones[zone_type].managed_pages;
+		managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
 	val->totalram = managed_pages;
 	val->sharedram = node_page_state(pgdat, NR_SHMEM);
 	val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 		struct zone *zone = &pgdat->node_zones[zone_type];
 
 		if (is_highmem(zone)) {
-			managed_highpages += zone->managed_pages;
+			managed_highpages += zone_managed_pages(zone);
 			free_highpages += zone_page_state(zone, NR_FREE_PAGES);
 		}
 	}
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
 			K(zone->present_pages),
-			K(zone->managed_pages),
+			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			zone_page_state(zone, NR_KERNEL_STACK_KB),
 			K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone)
 	 * The per-cpu-pages pools are set to around 1000th of the
 	 * size of the zone.
 	 */
-	batch = zone->managed_pages / 1024;
+	batch = zone_managed_pages(zone) / 1024;
 	/* But no more than a meg. */
 	if (batch * PAGE_SIZE > 1024 * 1024)
 		batch = (1024 * 1024) / PAGE_SIZE;
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p)
 	memset(p, 0, sizeof(*p));
 
 	pcp = &p->pcp;
-	pcp->count = 0;
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
 {
 	if (percpu_pagelist_fraction)
 		pageset_set_high(pcp,
-			(zone->managed_pages /
+			(zone_managed_pages(zone) /
 				percpu_pagelist_fraction));
 	else
 		pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
 * with no available memory, a warning is printed and the start and end
 * PFNs will be 0.
 */
-void __meminit get_pfn_range_for_nid(unsigned int nid,
+void __init get_pfn_range_for_nid(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	unsigned long this_start_pfn, this_end_pfn;
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void)
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 * zones within a node are in order of monotonic increases memory addresses
 */
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __init adjust_zone_range_for_zone_movable(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __init zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
-unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __init __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
 				unsigned long range_end_pfn)
 {
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 }
 
 /* Return the number of page frames in holes in a zone on a node */
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __init zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 }
 
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static inline unsigned long __init zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 	return zones_size[zone_type];
 }
 
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+static inline unsigned long __init zone_absent_pages_in_node(int nid,
 						unsigned long zone_type,
 						unsigned long node_start_pfn,
 						unsigned long node_end_pfn,
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 						unsigned long node_start_pfn,
 						unsigned long node_end_pfn,
 						unsigned long *zones_size,
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
 							unsigned long remaining_pages)
 {
-	zone->managed_pages = remaining_pages;
+	atomic_long_set(&zone->managed_pages, remaining_pages);
 	zone_set_nid(zone, nid);
 	zone->name = zone_names[idx];
 	zone->zone_pgdat = NODE_DATA(nid);
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
 {
-	/*
-	 * We start only with one section of pages, more pages are added as
-	 * needed until the rest of deferred pages are initialized.
-	 */
-	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
-						pgdat->node_spanned_pages);
 	pgdat->first_deferred_pfn = ULONG_MAX;
 }
 #else
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore);
 
 void adjust_managed_page_count(struct page *page, long count)
 {
-	spin_lock(&managed_page_count_lock);
-	page_zone(page)->managed_pages += count;
-	totalram_pages += count;
+	atomic_long_add(count, &page_zone(page)->managed_pages);
+	totalram_pages_add(count);
 #ifdef CONFIG_HIGHMEM
 	if (PageHighMem(page))
-		totalhigh_pages += count;
+		totalhigh_pages_add(count);
 #endif
-	spin_unlock(&managed_page_count_lock);
 }
 EXPORT_SYMBOL(adjust_managed_page_count);
 
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
 {
 	void *pos;
 	unsigned long pages = 0;
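zone_managed_pages(), used throughout this conversion, is just a read-side wrapper over the new atomic counter; approximately:

	/* include/linux/mmzone.h (approximate sketch of the accompanying header change) */
	static inline unsigned long zone_managed_pages(struct zone *zone)
	{
		return (unsigned long)atomic_long_read(&zone->managed_pages);
	}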
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area);
 void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
-	totalram_pages++;
-	page_zone(page)->managed_pages++;
-	totalhigh_pages++;
+	totalram_pages_inc();
+	atomic_long_inc(&page_zone(page)->managed_pages);
+	totalhigh_pages_inc();
 }
 #endif
 
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str)
 		physpages << (PAGE_SHIFT - 10),
 		codesize >> 10, datasize >> 10, rosize >> 10,
 		(init_data_size + init_code_size) >> 10, bss_size >> 10,
-		(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
 		totalcma_pages << (PAGE_SHIFT - 10),
 #ifdef CONFIG_HIGHMEM
-		totalhigh_pages << (PAGE_SHIFT - 10),
+		totalhigh_pages() << (PAGE_SHIFT - 10),
 #endif
 		str ? ", " : "", str ? str : "");
 }
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void)
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			long max = 0;
+			unsigned long managed_pages = zone_managed_pages(zone);
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void)
 			/* we treat the high watermark as reserved pages. */
 			max += high_wmark_pages(zone);
 
-			if (max > zone->managed_pages)
-				max = zone->managed_pages;
+			if (max > managed_pages)
+				max = managed_pages;
 
 			pgdat->totalreserve_pages += max;
 
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
 			struct zone *zone = pgdat->node_zones + j;
-			unsigned long managed_pages = zone->managed_pages;
+			unsigned long managed_pages = zone_managed_pages(zone);
 
 			zone->lowmem_reserve[j] = 0;
 
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void)
 				lower_zone->lowmem_reserve[j] =
 					managed_pages / sysctl_lowmem_reserve_ratio[idx];
 			}
-			managed_pages += lower_zone->managed_pages;
+			managed_pages += zone_managed_pages(lower_zone);
 		}
 	}
 }
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void)
 	/* Calculate total number of !ZONE_HIGHMEM pages */
 	for_each_zone(zone) {
 		if (!is_highmem(zone))
-			lowmem_pages += zone->managed_pages;
+			lowmem_pages += zone_managed_pages(zone);
 	}
 
 	for_each_zone(zone) {
 		u64 tmp;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->managed_pages;
+		tmp = (u64)pages_min * zone_managed_pages(zone);
 		do_div(tmp, lowmem_pages);
 		if (is_highmem(zone)) {
 			/*
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void)
 			 */
 			unsigned long min_pages;
 
-			min_pages = zone->managed_pages / 1024;
+			min_pages = zone_managed_pages(zone) / 1024;
 			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
-			zone->watermark[WMARK_MIN] = min_pages;
+			zone->_watermark[WMARK_MIN] = min_pages;
 		} else {
 			/*
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->_watermark[WMARK_MIN] = tmp;
 		}
 
 		/*
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void)
 		 * ensure a minimum size on small systems.
 		 */
 		tmp = max_t(u64, tmp >> 2,
-			    mult_frac(zone->managed_pages,
+			    mult_frac(zone_managed_pages(zone),
 				      watermark_scale_factor, 10000));
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->watermark_boost = 0;
 
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
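The watermark array is renamed to _watermark[] and paired with the new watermark_boost field, so readers are expected to go through accessors that add the boost; approximately:

	/* include/linux/mmzone.h (approximate sketch of the accompanying header change) */
	#define min_wmark_pages(z)	(z->_watermark[WMARK_MIN] + z->watermark_boost)
	#define low_wmark_pages(z)	(z->_watermark[WMARK_LOW] + z->watermark_boost)
	#define high_wmark_pages(z)	(z->_watermark[WMARK_HIGH] + z->watermark_boost)
	#define wmark_pages(z, i)	(z->_watermark[i] + z->watermark_boost)

This is why the allocator fast path above switched from zone->watermark[...] to wmark_pages(zone, ...): a boosted zone transiently behaves as if its watermarks were higher.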
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void)
 		pgdat->min_unmapped_pages = 0;
 
 	for_each_zone(zone)
-		zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
-				sysctl_min_unmapped_ratio) / 100;
+		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+				sysctl_min_unmapped_ratio) / 100;
 }
 
 
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void)
 		pgdat->min_slab_pages = 0;
 
 	for_each_zone(zone)
-		zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
-				sysctl_min_slab_ratio) / 100;
+		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+				sysctl_min_slab_ratio) / 100;
 }
 
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 * race condition. So you can't expect this function should be exact.
 */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype,
-			 bool skip_hwpoisoned_pages)
+			 int migratetype, int flags)
 {
 	unsigned long pfn, iter, found;
 
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * The HWPoisoned page may be not in buddy system, and
 		 * page_count() is not 0.
 		 */
-		if (skip_hwpoisoned_pages && PageHWPoison(page))
+		if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
 			continue;
 
 		if (__PageMovable(page))
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 	return false;
 unmovable:
 	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+	if (flags & REPORT_FAILURE)
+		dump_page(pfn_to_page(pfn+iter), "unmovable page");
 	return true;
 }
 
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype,
-				       false);
+				       pfn_max_align_up(end), migratetype, 0);
 	if (ret)
 		return ret;
 