Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	494
1 file changed, 201 insertions(+), 293 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef44ad736ca1..736d8e1b6381 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
-#include <linux/ftrace_event.h>
-#include <linux/memcontrol.h>
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
  */
 DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /*
@@ -680,9 +679,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int migratetype = 0;
 	int batch_free = 0;
 	int to_free = count;
+	unsigned long nr_scanned;
 
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (to_free) {
 		struct page *page;
@@ -731,8 +733,11 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
+	unsigned long nr_scanned;
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	__free_one_page(page, pfn, zone, order, migratetype);
 	if (unlikely(!is_migrate_isolate(migratetype)))
@@ -1008,7 +1013,7 @@ int move_freepages(struct zone *zone,
 	 * Remove at a later date when no bug reports exist related to
 	 * grouping pages by mobility
 	 */
-	BUG_ON(page_zone(start_page) != page_zone(end_page));
+	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
 	for (page = start_page; page <= end_page;) {
@@ -1257,15 +1262,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
-	int to_drain;
-	unsigned long batch;
+	int to_drain, batch;
 
 	local_irq_save(flags);
 	batch = ACCESS_ONCE(pcp->batch);
-	if (pcp->count >= batch)
-		to_drain = batch;
-	else
-		to_drain = pcp->count;
+	to_drain = min(pcp->count, batch);
 	if (to_drain > 0) {
 		free_pcppages_bulk(zone, to_drain, pcp);
 		pcp->count -= to_drain;
@@ -1610,6 +1611,9 @@ again:
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
+		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1712,7 +1716,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 	long free_cma = 0;
 
@@ -1727,7 +1730,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-	if (free_pages - free_cma <= min + lowmem_reserve)
+	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1922,6 +1925,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif	/* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1939,8 +1954,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
 				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;
 
 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1964,9 +1983,11 @@ zonelist_scan:
 		 */
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
+				break;
+			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
+				nr_fair_skipped++;
 				continue;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2072,13 +2093,7 @@ this_zone_full:
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2087,8 +2102,37 @@ this_zone_full:
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+		return page;
+	}
 
-	return page;
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node. However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet. Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }
 
 /*
@@ -2201,8 +2245,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page;
 
-	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+	/* Acquire the per-zone oom lock for each zone */
+	if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
@@ -2240,7 +2284,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_zonelist_unlock(zonelist, gfp_mask);
 	return page;
 }
 
@@ -2251,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
 	int classzone_idx, int migratetype, enum migrate_mode mode,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int *contended_compaction, bool *deferred_compaction)
 {
-	if (!order)
-		return NULL;
+	struct zone *last_compact_zone = NULL;
+	unsigned long compact_result;
+	struct page *page;
 
-	if (compaction_deferred(preferred_zone, order)) {
-		*deferred_compaction = true;
+	if (!order)
 		return NULL;
-	}
 
 	current->flags |= PF_MEMALLOC;
-	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+	compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, mode,
-						contended_compaction);
+						contended_compaction,
+						&last_compact_zone);
 	current->flags &= ~PF_MEMALLOC;
 
-	if (*did_some_progress != COMPACT_SKIPPED) {
-		struct page *page;
+	switch (compact_result) {
+	case COMPACT_DEFERRED:
+		*deferred_compaction = true;
+		/* fall-through */
+	case COMPACT_SKIPPED:
+		return NULL;
+	default:
+		break;
+	}
 
-		/* Page migration frees to the PCP lists but we want merging */
-		drain_pages(get_cpu());
-		put_cpu();
+	/*
+	 * At least in one zone compaction wasn't deferred or skipped, so let's
+	 * count a compaction stall
+	 */
+	count_vm_event(COMPACTSTALL);
 
-		page = get_page_from_freelist(gfp_mask, nodemask,
-				order, zonelist, high_zoneidx,
-				alloc_flags & ~ALLOC_NO_WATERMARKS,
-				preferred_zone, classzone_idx, migratetype);
-		if (page) {
-			preferred_zone->compact_blockskip_flush = false;
-			compaction_defer_reset(preferred_zone, order, true);
-			count_vm_event(COMPACTSUCCESS);
-			return page;
-		}
+	/* Page migration frees to the PCP lists but we want merging */
+	drain_pages(get_cpu());
+	put_cpu();
 
-		/*
-		 * It's bad if compaction run occurs and fails.
-		 * The most likely reason is that pages exist,
-		 * but not enough to satisfy watermarks.
-		 */
-		count_vm_event(COMPACTFAIL);
+	page = get_page_from_freelist(gfp_mask, nodemask,
+			order, zonelist, high_zoneidx,
+			alloc_flags & ~ALLOC_NO_WATERMARKS,
+			preferred_zone, classzone_idx, migratetype);
 
-		/*
-		 * As async compaction considers a subset of pageblocks, only
-		 * defer if the failure was a sync compaction failure.
-		 */
-		if (mode != MIGRATE_ASYNC)
-			defer_compaction(preferred_zone, order);
+	if (page) {
+		struct zone *zone = page_zone(page);
 
-		cond_resched();
+		zone->compact_blockskip_flush = false;
+		compaction_defer_reset(zone, order, true);
+		count_vm_event(COMPACTSUCCESS);
+		return page;
 	}
 
+	/*
+	 * last_compact_zone is where try_to_compact_pages thought allocation
+	 * should succeed, so it did not defer compaction. But here we know
+	 * that it didn't succeed, so we do the defer.
+	 */
+	if (last_compact_zone && mode != MIGRATE_ASYNC)
+		defer_compaction(last_compact_zone, order);
+
+	/*
+	 * It's bad if compaction run occurs and fails. The most likely reason
+	 * is that pages exist, but not enough to satisfy watermarks.
+	 */
+	count_vm_event(COMPACTFAIL);
+
+	cond_resched();
+
 	return NULL;
 }
 #else
@@ -2310,9 +2368,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int classzone_idx, int migratetype,
-	enum migrate_mode mode, bool *contended_compaction,
-	bool *deferred_compaction, unsigned long *did_some_progress)
+	int classzone_idx, int migratetype, enum migrate_mode mode,
+	int *contended_compaction, bool *deferred_compaction)
 {
 	return NULL;
 }
@@ -2409,37 +2466,17 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
 			     struct zonelist *zonelist,
 			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+			     struct zone *preferred_zone,
+			     nodemask_t *nodemask)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask)
 		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 }
 
@@ -2486,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	return alloc_flags;
@@ -2510,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
-	bool contended_compaction = false;
+	int contended_compaction = COMPACT_CONTENDED_NONE;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2537,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
 restart:
 	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+		wake_all_kswapds(order, zonelist, high_zoneidx,
+				preferred_zone, nodemask);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2610,29 +2648,50 @@ rebalance:
 					preferred_zone,
 					classzone_idx, migratetype,
 					migration_mode, &contended_compaction,
-					&deferred_compaction,
-					&did_some_progress);
+					&deferred_compaction);
 	if (page)
 		goto got_pg;
 
+	/* Checks for THP-specific high-order allocations */
+	if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+		/*
+		 * If compaction is deferred for high-order allocations, it is
+		 * because sync compaction recently failed. If this is the case
+		 * and the caller requested a THP allocation, we do not want
+		 * to heavily disrupt the system, so we fail the allocation
+		 * instead of entering direct reclaim.
+		 */
+		if (deferred_compaction)
+			goto nopage;
+
+		/*
+		 * In all zones where compaction was attempted (and not
+		 * deferred or skipped), lock contention has been detected.
+		 * For THP allocation we do not want to disrupt the others
+		 * so we fallback to base pages instead.
+		 */
+		if (contended_compaction == COMPACT_CONTENDED_LOCK)
+			goto nopage;
+
+		/*
+		 * If compaction was aborted due to need_resched(), we do not
+		 * want to further increase allocation latency, unless it is
+		 * khugepaged trying to collapse.
+		 */
+		if (contended_compaction == COMPACT_CONTENDED_SCHED
+			&& !(current->flags & PF_KTHREAD))
+			goto nopage;
+	}
+
 	/*
 	 * It can become very expensive to allocate transparent hugepages at
 	 * fault, so use asynchronous memory compaction for THP unless it is
 	 * khugepaged trying to collapse.
 	 */
-	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
+						(current->flags & PF_KTHREAD))
 		migration_mode = MIGRATE_SYNC_LIGHT;
 
-	/*
-	 * If compaction is deferred for high-order allocations, it is because
-	 * sync compaction recently failed. In this is the case and the caller
-	 * requested a movable allocation that does not heavily disrupt the
-	 * system then fail the allocation instead of entering direct reclaim.
-	 */
-	if ((deferred_compaction || contended_compaction) &&
-						(gfp_mask & __GFP_NO_KSWAPD))
-		goto nopage;
-
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2702,8 +2761,7 @@ rebalance:
 					preferred_zone,
 					classzone_idx, migratetype,
 					migration_mode, &contended_compaction,
-					&deferred_compaction,
-					&did_some_progress);
+					&deferred_compaction);
 		if (page)
 			goto got_pg;
 	}
@@ -2729,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zone *preferred_zone;
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
-	int migratetype = allocflags_to_migratetype(gfp_mask);
+	int migratetype = gfpflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	int classzone_idx;
@@ -2751,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
@@ -2762,33 +2823,12 @@ retry_cpuset:
 		goto out;
 	classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
-#ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
-#endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
 		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node. However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet. Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-					preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
-		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
 		 * complete.
@@ -2962,7 +3002,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
 * but is not exact.
 */
-void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
 	unsigned order = get_order(size);
 	struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2970,7 +3010,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 		return NULL;
 	return make_alloc_exact((unsigned long)page_address(p), order, size);
 }
-EXPORT_SYMBOL(alloc_pages_exact_nid);
 
 /**
  * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3052,7 +3091,7 @@ static inline void show_node(struct zone *zone)
 void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
-	val->sharedram = 0;
+	val->sharedram = global_page_state(NR_SHMEM);
 	val->freeram = global_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
 	val->totalhigh = totalhigh_pages;
@@ -3072,6 +3111,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
 		managed_pages += pgdat->node_zones[zone_type].managed_pages;
 	val->totalram = managed_pages;
+	val->sharedram = node_page_state(nid, NR_SHMEM);
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3253,12 +3293,12 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-			zone->pages_scanned,
+			K(zone_page_state(zone, NR_PAGES_SCANNED)),
 			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->lowmem_reserve[i]);
+			printk(" %ld", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
 
@@ -3572,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 		zonelist->_zonerefs[pos].zone_idx = 0;
 }
 
+#if defined(CONFIG_64BIT)
+/*
+ * Devices that require DMA32/DMA are relatively rare and do not justify a
+ * penalty to every machine in case the specialised case applies. Default
+ * to Node-ordering on 64-bit NUMA machines
+ */
+static int default_zonelist_order(void)
+{
+	return ZONELIST_ORDER_NODE;
+}
+#else
+/*
+ * On 32-bit, the Normal zone needs to be preserved for allocations accessible
+ * by the kernel. If processes running on node 0 deplete the low memory zone
+ * then reclaim will occur more frequency increasing stalls and potentially
+ * be easier to OOM if a large percentage of the zone is under writeback or
+ * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
+ * Hence, default to zone ordering on 32-bit.
+ */
 static int default_zonelist_order(void)
 {
-	int nid, zone_type;
-	unsigned long low_kmem_size, total_size;
-	struct zone *z;
-	int average_size;
-	/*
-	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
-	 * If they are really small and used heavily, the system can fall
-	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
-	 */
-	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
-	low_kmem_size = 0;
-	total_size = 0;
-	for_each_online_node(nid) {
-		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->managed_pages;
-				total_size += z->managed_pages;
-			} else if (zone_type == ZONE_NORMAL) {
-				/*
-				 * If any node has only lowmem, then node order
-				 * is preferred to allow kernel allocations
-				 * locally; otherwise, they can easily infringe
-				 * on other nodes when there is an abundance of
-				 * lowmem available to allocate from.
-				 */
-				return ZONELIST_ORDER_NODE;
-			}
-		}
-	}
-	if (!low_kmem_size ||		/* there are no DMA area. */
-	    low_kmem_size > total_size/2)	/* DMA/DMA32 is big. */
-		return ZONELIST_ORDER_NODE;
-	/*
-	 * look into each node's config.
-	 * If there is a node whose DMA/DMA32 memory is very big area on
-	 * local memory, NODE_ORDER may be suitable.
-	 */
-	average_size = total_size /
-			(nodes_weight(node_states[N_MEMORY]) + 1);
-	for_each_online_node(nid) {
-		low_kmem_size = 0;
-		total_size = 0;
-		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
-			}
-		}
-		if (low_kmem_size &&
-		    total_size > average_size && /* ignore small node */
-		    low_kmem_size > total_size * 70/100)
-			return ZONELIST_ORDER_NODE;
-	}
 	return ZONELIST_ORDER_ZONE;
 }
+#endif /* CONFIG_64BIT */
 
 static void set_zonelist_order(void)
 {
@@ -4969,6 +4971,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+		(u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
 #endif
 	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
 				  zones_size, zholes_size);
@@ -5579,7 +5583,7 @@ static void calculate_totalreserve_pages(void)
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			unsigned long max = 0;
+			long max = 0;
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5694,9 +5698,8 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				      high_wmark_pages(zone) -
-				      low_wmark_pages(zone) -
-				      zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -6271,8 +6274,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
 		if (list_empty(&cc->migratepages)) {
 			cc->nr_migratepages = 0;
-			pfn = isolate_migratepages_range(cc->zone, cc,
-							 pfn, end, true);
+			pfn = isolate_migratepages_range(cc, pfn, end);
 			if (!pfn) {
 				ret = -EINTR;
 				break;
@@ -6548,97 +6550,3 @@ bool is_free_buddy_page(struct page *page)
 	return order < MAX_ORDER;
 }
 #endif
-
-static const struct trace_print_flags pageflag_names[] = {
-	{1UL << PG_locked,		"locked"	},
-	{1UL << PG_error,		"error"		},
-	{1UL << PG_referenced,		"referenced"	},
-	{1UL << PG_uptodate,		"uptodate"	},
-	{1UL << PG_dirty,		"dirty"		},
-	{1UL << PG_lru,			"lru"		},
-	{1UL << PG_active,		"active"	},
-	{1UL << PG_slab,		"slab"		},
-	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
-	{1UL << PG_arch_1,		"arch_1"	},
-	{1UL << PG_reserved,		"reserved"	},
-	{1UL << PG_private,		"private"	},
-	{1UL << PG_private_2,		"private_2"	},
-	{1UL << PG_writeback,		"writeback"	},
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-	{1UL << PG_head,		"head"		},
-	{1UL << PG_tail,		"tail"		},
-#else
-	{1UL << PG_compound,		"compound"	},
-#endif
-	{1UL << PG_swapcache,		"swapcache"	},
-	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
-	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_swapbacked,		"swapbacked"	},
-	{1UL << PG_unevictable,		"unevictable"	},
-#ifdef CONFIG_MMU
-	{1UL << PG_mlocked,		"mlocked"	},
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-	{1UL << PG_uncached,		"uncached"	},
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-	{1UL << PG_hwpoison,		"hwpoison"	},
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	{1UL << PG_compound_lock,	"compound_lock"	},
-#endif
-};
-
-static void dump_page_flags(unsigned long flags)
-{
-	const char *delim = "";
-	unsigned long mask;
-	int i;
-
-	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-
-	printk(KERN_ALERT "page flags: %#lx(", flags);
-
-	/* remove zone id */
-	flags &= (1UL << NR_PAGEFLAGS) - 1;
-
-	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
-
-		mask = pageflag_names[i].mask;
-		if ((flags & mask) != mask)
-			continue;
-
-		flags &= ~mask;
-		printk("%s%s", delim, pageflag_names[i].name);
-		delim = "|";
-	}
-
-	/* check for left over flags */
-	if (flags)
-		printk("%s%#lx", delim, flags);
-
-	printk(")\n");
-}
-
-void dump_page_badflags(struct page *page, const char *reason,
-		unsigned long badflags)
-{
-	printk(KERN_ALERT
-	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-		page, atomic_read(&page->_count), page_mapcount(page),
-		page->mapping, page->index);
-	dump_page_flags(page->flags);
-	if (reason)
-		pr_alert("page dumped because: %s\n", reason);
-	if (page->flags & badflags) {
-		pr_alert("bad because of flags:\n");
-		dump_page_flags(page->flags & badflags);
-	}
-	mem_cgroup_print_bad_page(page);
-}
-
-void dump_page(struct page *page, const char *reason)
-{
-	dump_page_badflags(page, reason, 0);
-}
-EXPORT_SYMBOL(dump_page);