Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  667
1 file changed, 408 insertions(+), 259 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3e0c69a97b7..eaa64d2ffdc5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,12 +55,13 @@
55#include <linux/kmemleak.h> 55#include <linux/kmemleak.h>
56#include <linux/compaction.h> 56#include <linux/compaction.h>
57#include <trace/events/kmem.h> 57#include <trace/events/kmem.h>
58#include <trace/events/oom.h>
58#include <linux/prefetch.h> 59#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 60#include <linux/mm_inline.h>
60#include <linux/migrate.h> 61#include <linux/migrate.h>
61#include <linux/page_ext.h>
62#include <linux/hugetlb.h> 62#include <linux/hugetlb.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/sched/mm.h>
64#include <linux/page_owner.h> 65#include <linux/page_owner.h>
65#include <linux/kthread.h> 66#include <linux/kthread.h>
66#include <linux/memcontrol.h> 67#include <linux/memcontrol.h>
@@ -91,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
91int _node_numa_mem_[MAX_NUMNODES]; 92int _node_numa_mem_[MAX_NUMNODES];
92#endif 93#endif
93 94
95/* work_structs for global per-cpu drains */
96DEFINE_MUTEX(pcpu_drain_mutex);
97DEFINE_PER_CPU(struct work_struct, pcpu_drain);
98
94#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 99#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
95volatile unsigned long latent_entropy __latent_entropy; 100volatile unsigned long latent_entropy __latent_entropy;
96EXPORT_SYMBOL(latent_entropy); 101EXPORT_SYMBOL(latent_entropy);
@@ -714,7 +719,7 @@ static inline void rmv_page_order(struct page *page)
714/* 719/*
715 * This function checks whether a page is free && is the buddy 720 * This function checks whether a page is free && is the buddy
716 * we can do coalesce a page and its buddy if 721 * we can do coalesce a page and its buddy if
717 * (a) the buddy is not in a hole && 722 * (a) the buddy is not in a hole (check before calling!) &&
718 * (b) the buddy is in the buddy system && 723 * (b) the buddy is in the buddy system &&
719 * (c) a page and its buddy have the same order && 724 * (c) a page and its buddy have the same order &&
720 * (d) a page and its buddy are in the same zone. 725 * (d) a page and its buddy are in the same zone.
@@ -729,9 +734,6 @@ static inline void rmv_page_order(struct page *page)
729static inline int page_is_buddy(struct page *page, struct page *buddy, 734static inline int page_is_buddy(struct page *page, struct page *buddy,
730 unsigned int order) 735 unsigned int order)
731{ 736{
732 if (!pfn_valid_within(page_to_pfn(buddy)))
733 return 0;
734
735 if (page_is_guard(buddy) && page_order(buddy) == order) { 737 if (page_is_guard(buddy) && page_order(buddy) == order) {
736 if (page_zone_id(page) != page_zone_id(buddy)) 738 if (page_zone_id(page) != page_zone_id(buddy))
737 return 0; 739 return 0;
@@ -787,9 +789,8 @@ static inline void __free_one_page(struct page *page,
787 struct zone *zone, unsigned int order, 789 struct zone *zone, unsigned int order,
788 int migratetype) 790 int migratetype)
789{ 791{
790 unsigned long page_idx; 792 unsigned long combined_pfn;
791 unsigned long combined_idx; 793 unsigned long uninitialized_var(buddy_pfn);
792 unsigned long uninitialized_var(buddy_idx);
793 struct page *buddy; 794 struct page *buddy;
794 unsigned int max_order; 795 unsigned int max_order;
795 796
@@ -802,15 +803,16 @@ static inline void __free_one_page(struct page *page,
802 if (likely(!is_migrate_isolate(migratetype))) 803 if (likely(!is_migrate_isolate(migratetype)))
803 __mod_zone_freepage_state(zone, 1 << order, migratetype); 804 __mod_zone_freepage_state(zone, 1 << order, migratetype);
804 805
805 page_idx = pfn & ((1 << MAX_ORDER) - 1); 806 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
806
807 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
808 VM_BUG_ON_PAGE(bad_range(zone, page), page); 807 VM_BUG_ON_PAGE(bad_range(zone, page), page);
809 808
810continue_merging: 809continue_merging:
811 while (order < max_order - 1) { 810 while (order < max_order - 1) {
812 buddy_idx = __find_buddy_index(page_idx, order); 811 buddy_pfn = __find_buddy_pfn(pfn, order);
813 buddy = page + (buddy_idx - page_idx); 812 buddy = page + (buddy_pfn - pfn);
813
814 if (!pfn_valid_within(buddy_pfn))
815 goto done_merging;
814 if (!page_is_buddy(page, buddy, order)) 816 if (!page_is_buddy(page, buddy, order))
815 goto done_merging; 817 goto done_merging;
816 /* 818 /*
@@ -824,9 +826,9 @@ continue_merging:
824 zone->free_area[order].nr_free--; 826 zone->free_area[order].nr_free--;
825 rmv_page_order(buddy); 827 rmv_page_order(buddy);
826 } 828 }
827 combined_idx = buddy_idx & page_idx; 829 combined_pfn = buddy_pfn & pfn;
828 page = page + (combined_idx - page_idx); 830 page = page + (combined_pfn - pfn);
829 page_idx = combined_idx; 831 pfn = combined_pfn;
830 order++; 832 order++;
831 } 833 }
832 if (max_order < MAX_ORDER) { 834 if (max_order < MAX_ORDER) {
@@ -841,8 +843,8 @@ continue_merging:
841 if (unlikely(has_isolate_pageblock(zone))) { 843 if (unlikely(has_isolate_pageblock(zone))) {
842 int buddy_mt; 844 int buddy_mt;
843 845
844 buddy_idx = __find_buddy_index(page_idx, order); 846 buddy_pfn = __find_buddy_pfn(pfn, order);
845 buddy = page + (buddy_idx - page_idx); 847 buddy = page + (buddy_pfn - pfn);
846 buddy_mt = get_pageblock_migratetype(buddy); 848 buddy_mt = get_pageblock_migratetype(buddy);
847 849
848 if (migratetype != buddy_mt 850 if (migratetype != buddy_mt
@@ -865,12 +867,12 @@ done_merging:
865 * so it's less likely to be used soon and more likely to be merged 867 * so it's less likely to be used soon and more likely to be merged
866 * as a higher order page 868 * as a higher order page
867 */ 869 */
868 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 870 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
869 struct page *higher_page, *higher_buddy; 871 struct page *higher_page, *higher_buddy;
870 combined_idx = buddy_idx & page_idx; 872 combined_pfn = buddy_pfn & pfn;
871 higher_page = page + (combined_idx - page_idx); 873 higher_page = page + (combined_pfn - pfn);
872 buddy_idx = __find_buddy_index(combined_idx, order + 1); 874 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
873 higher_buddy = higher_page + (buddy_idx - combined_idx); 875 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
874 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 876 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
875 list_add_tail(&page->lru, 877 list_add_tail(&page->lru,
876 &zone->free_area[order].free_list[migratetype]); 878 &zone->free_area[order].free_list[migratetype]);
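
The merge loop above is the classic buddy-system arithmetic, now expressed directly in page frame numbers: the buddy of a 2^order block differs from it only in bit 'order' (which is essentially what __find_buddy_pfn() computes), and 'buddy_pfn & pfn' clears that bit to give the start of the merged block. The following is a minimal userspace sketch of that arithmetic, not kernel code; find_buddy_pfn() here is an invented stand-in for the kernel helper.

/* Illustrative sketch only -- not the kernel implementation.  It mimics the
 * pfn arithmetic the patch switches to: the buddy of a block at 'pfn' with
 * size 2^order differs only in bit 'order', and the merged block starts at
 * the lower of the two pfns. */
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip bit 'order' */
}

int main(void)
{
	unsigned long pfn = 0x1234c0;	/* arbitrary order-6-aligned pfn */
	unsigned int order;

	for (order = 6; order < 10; order++) {
		unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
		unsigned long combined_pfn = buddy_pfn & pfn; /* lower pfn of the pair */

		printf("order %u: pfn %#lx buddy %#lx merged block starts at %#lx\n",
		       order, pfn, buddy_pfn, combined_pfn);
		pfn = combined_pfn;	/* continue merging upwards */
	}
	return 0;
}

Compiled with any C compiler, it prints successively larger merged blocks, mirroring how __free_one_page() walks up the orders.
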
@@ -1087,10 +1089,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1087{ 1089{
1088 int migratetype = 0; 1090 int migratetype = 0;
1089 int batch_free = 0; 1091 int batch_free = 0;
1090 unsigned long nr_scanned; 1092 unsigned long nr_scanned, flags;
1091 bool isolated_pageblocks; 1093 bool isolated_pageblocks;
1092 1094
1093 spin_lock(&zone->lock); 1095 spin_lock_irqsave(&zone->lock, flags);
1094 isolated_pageblocks = has_isolate_pageblock(zone); 1096 isolated_pageblocks = has_isolate_pageblock(zone);
1095 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); 1097 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1096 if (nr_scanned) 1098 if (nr_scanned)
@@ -1139,7 +1141,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1139 trace_mm_page_pcpu_drain(page, 0, mt); 1141 trace_mm_page_pcpu_drain(page, 0, mt);
1140 } while (--count && --batch_free && !list_empty(list)); 1142 } while (--count && --batch_free && !list_empty(list));
1141 } 1143 }
1142 spin_unlock(&zone->lock); 1144 spin_unlock_irqrestore(&zone->lock, flags);
1143} 1145}
1144 1146
1145static void free_one_page(struct zone *zone, 1147static void free_one_page(struct zone *zone,
@@ -1147,8 +1149,9 @@ static void free_one_page(struct zone *zone,
1147 unsigned int order, 1149 unsigned int order,
1148 int migratetype) 1150 int migratetype)
1149{ 1151{
1150 unsigned long nr_scanned; 1152 unsigned long nr_scanned, flags;
1151 spin_lock(&zone->lock); 1153 spin_lock_irqsave(&zone->lock, flags);
1154 __count_vm_events(PGFREE, 1 << order);
1152 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); 1155 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1153 if (nr_scanned) 1156 if (nr_scanned)
1154 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); 1157 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1158,7 +1161,7 @@ static void free_one_page(struct zone *zone,
1158 migratetype = get_pfnblock_migratetype(page, pfn); 1161 migratetype = get_pfnblock_migratetype(page, pfn);
1159 } 1162 }
1160 __free_one_page(page, pfn, zone, order, migratetype); 1163 __free_one_page(page, pfn, zone, order, migratetype);
1161 spin_unlock(&zone->lock); 1164 spin_unlock_irqrestore(&zone->lock, flags);
1162} 1165}
1163 1166
1164static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1167static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1236,7 +1239,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1236 1239
1237static void __free_pages_ok(struct page *page, unsigned int order) 1240static void __free_pages_ok(struct page *page, unsigned int order)
1238{ 1241{
1239 unsigned long flags;
1240 int migratetype; 1242 int migratetype;
1241 unsigned long pfn = page_to_pfn(page); 1243 unsigned long pfn = page_to_pfn(page);
1242 1244
@@ -1244,10 +1246,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
1244 return; 1246 return;
1245 1247
1246 migratetype = get_pfnblock_migratetype(page, pfn); 1248 migratetype = get_pfnblock_migratetype(page, pfn);
1247 local_irq_save(flags);
1248 __count_vm_events(PGFREE, 1 << order);
1249 free_one_page(page_zone(page), page, pfn, order, migratetype); 1249 free_one_page(page_zone(page), page, pfn, order, migratetype);
1250 local_irq_restore(flags);
1251} 1250}
1252 1251
1253static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1252static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2219,8 +2218,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2219 int migratetype, bool cold) 2218 int migratetype, bool cold)
2220{ 2219{
2221 int i, alloced = 0; 2220 int i, alloced = 0;
2221 unsigned long flags;
2222 2222
2223 spin_lock(&zone->lock); 2223 spin_lock_irqsave(&zone->lock, flags);
2224 for (i = 0; i < count; ++i) { 2224 for (i = 0; i < count; ++i) {
2225 struct page *page = __rmqueue(zone, order, migratetype); 2225 struct page *page = __rmqueue(zone, order, migratetype);
2226 if (unlikely(page == NULL)) 2226 if (unlikely(page == NULL))
@@ -2256,7 +2256,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2256 * pages added to the pcp list. 2256 * pages added to the pcp list.
2257 */ 2257 */
2258 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2258 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2259 spin_unlock(&zone->lock); 2259 spin_unlock_irqrestore(&zone->lock, flags);
2260 return alloced; 2260 return alloced;
2261} 2261}
2262 2262
@@ -2341,16 +2341,26 @@ void drain_local_pages(struct zone *zone)
2341 drain_pages(cpu); 2341 drain_pages(cpu);
2342} 2342}
2343 2343
2344static void drain_local_pages_wq(struct work_struct *work)
2345{
2346 /*
2347 * drain_all_pages doesn't use proper cpu hotplug protection so
2348 * we can race with cpu offline when the WQ can move this from
2349 * a cpu pinned worker to an unbound one. We can operate on a different
2350 * cpu which is allright but we also have to make sure to not move to
2351 * a different one.
2352 */
2353 preempt_disable();
2354 drain_local_pages(NULL);
2355 preempt_enable();
2356}
2357
2344/* 2358/*
2345 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2359 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2346 * 2360 *
2347 * When zone parameter is non-NULL, spill just the single zone's pages. 2361 * When zone parameter is non-NULL, spill just the single zone's pages.
2348 * 2362 *
2349 * Note that this code is protected against sending an IPI to an offline 2363 * Note that this can be extremely slow as the draining happens in a workqueue.
2350 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
2351 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
2352 * nothing keeps CPUs from showing up after we populated the cpumask and
2353 * before the call to on_each_cpu_mask().
2354 */ 2364 */
2355void drain_all_pages(struct zone *zone) 2365void drain_all_pages(struct zone *zone)
2356{ 2366{
@@ -2362,6 +2372,21 @@ void drain_all_pages(struct zone *zone)
2362 */ 2372 */
2363 static cpumask_t cpus_with_pcps; 2373 static cpumask_t cpus_with_pcps;
2364 2374
2375 /* Workqueues cannot recurse */
2376 if (current->flags & PF_WQ_WORKER)
2377 return;
2378
2379 /*
2380 * Do not drain if one is already in progress unless it's specific to
2381 * a zone. Such callers are primarily CMA and memory hotplug and need
2382 * the drain to be complete when the call returns.
2383 */
2384 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2385 if (!zone)
2386 return;
2387 mutex_lock(&pcpu_drain_mutex);
2388 }
2389
2365 /* 2390 /*
2366 * We don't care about racing with CPU hotplug event 2391 * We don't care about racing with CPU hotplug event
2367 * as offline notification will cause the notified 2392 * as offline notification will cause the notified
@@ -2392,8 +2417,16 @@ void drain_all_pages(struct zone *zone)
2392 else 2417 else
2393 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2418 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2394 } 2419 }
2395 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, 2420
2396 zone, 1); 2421 for_each_cpu(cpu, &cpus_with_pcps) {
2422 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2423 INIT_WORK(work, drain_local_pages_wq);
2424 schedule_work_on(cpu, work);
2425 }
2426 for_each_cpu(cpu, &cpus_with_pcps)
2427 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2428
2429 mutex_unlock(&pcpu_drain_mutex);
2397} 2430}
2398 2431
2399#ifdef CONFIG_HIBERNATION 2432#ifdef CONFIG_HIBERNATION
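
The rewritten drain_all_pages() above replaces the IPI broadcast with one work item per target CPU, queued with schedule_work_on() and then waited for with flush_work(). Below is a hedged, self-contained module sketch of that queue-then-flush pattern; all demo_* names are invented, and unlike the patch, which deliberately avoids hotplug locking and relies on drain_local_pages_wq()'s preempt_disable(), the sketch simply takes get_online_cpus() for simplicity.

/* Hedged sketch, not part of the patch: a toy module reproducing the
 * queue-one-work-per-CPU-then-flush pattern used by the new drain_all_pages(). */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/cpu.h>

static DEFINE_PER_CPU(struct work_struct, demo_drain);

static void demo_drain_fn(struct work_struct *work)
{
	/* Like drain_local_pages_wq(): stay pinned while doing per-cpu work. */
	preempt_disable();
	pr_info("demo drain on CPU %d\n", smp_processor_id());
	preempt_enable();
}

static int __init demo_init(void)
{
	int cpu;

	get_online_cpus();	/* simpler than the patch: just block hotplug */
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(&demo_drain, cpu);

		INIT_WORK(work, demo_drain_fn);
		schedule_work_on(cpu, work);	/* run on that CPU's kworker */
	}
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(&demo_drain, cpu));	/* wait for all */
	put_online_cpus();

	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
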
@@ -2444,17 +2477,20 @@ void free_hot_cold_page(struct page *page, bool cold)
2444{ 2477{
2445 struct zone *zone = page_zone(page); 2478 struct zone *zone = page_zone(page);
2446 struct per_cpu_pages *pcp; 2479 struct per_cpu_pages *pcp;
2447 unsigned long flags;
2448 unsigned long pfn = page_to_pfn(page); 2480 unsigned long pfn = page_to_pfn(page);
2449 int migratetype; 2481 int migratetype;
2450 2482
2483 if (in_interrupt()) {
2484 __free_pages_ok(page, 0);
2485 return;
2486 }
2487
2451 if (!free_pcp_prepare(page)) 2488 if (!free_pcp_prepare(page))
2452 return; 2489 return;
2453 2490
2454 migratetype = get_pfnblock_migratetype(page, pfn); 2491 migratetype = get_pfnblock_migratetype(page, pfn);
2455 set_pcppage_migratetype(page, migratetype); 2492 set_pcppage_migratetype(page, migratetype);
2456 local_irq_save(flags); 2493 preempt_disable();
2457 __count_vm_event(PGFREE);
2458 2494
2459 /* 2495 /*
2460 * We only track unmovable, reclaimable and movable on pcp lists. 2496 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2471,6 +2507,7 @@ void free_hot_cold_page(struct page *page, bool cold)
2471 migratetype = MIGRATE_MOVABLE; 2507 migratetype = MIGRATE_MOVABLE;
2472 } 2508 }
2473 2509
2510 __count_vm_event(PGFREE);
2474 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2511 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2475 if (!cold) 2512 if (!cold)
2476 list_add(&page->lru, &pcp->lists[migratetype]); 2513 list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2484,7 +2521,7 @@ void free_hot_cold_page(struct page *page, bool cold)
2484 } 2521 }
2485 2522
2486out: 2523out:
2487 local_irq_restore(flags); 2524 preempt_enable();
2488} 2525}
2489 2526
2490/* 2527/*
@@ -2602,74 +2639,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2602#endif 2639#endif
2603} 2640}
2604 2641
2642/* Remove page from the per-cpu list, caller must protect the list */
2643static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2644 bool cold, struct per_cpu_pages *pcp,
2645 struct list_head *list)
2646{
2647 struct page *page;
2648
2649 VM_BUG_ON(in_interrupt());
2650
2651 do {
2652 if (list_empty(list)) {
2653 pcp->count += rmqueue_bulk(zone, 0,
2654 pcp->batch, list,
2655 migratetype, cold);
2656 if (unlikely(list_empty(list)))
2657 return NULL;
2658 }
2659
2660 if (cold)
2661 page = list_last_entry(list, struct page, lru);
2662 else
2663 page = list_first_entry(list, struct page, lru);
2664
2665 list_del(&page->lru);
2666 pcp->count--;
2667 } while (check_new_pcp(page));
2668
2669 return page;
2670}
2671
2672/* Lock and remove page from the per-cpu list */
2673static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2674 struct zone *zone, unsigned int order,
2675 gfp_t gfp_flags, int migratetype)
2676{
2677 struct per_cpu_pages *pcp;
2678 struct list_head *list;
2679 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2680 struct page *page;
2681
2682 preempt_disable();
2683 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2684 list = &pcp->lists[migratetype];
2685 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
2686 if (page) {
2687 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2688 zone_statistics(preferred_zone, zone);
2689 }
2690 preempt_enable();
2691 return page;
2692}
2693
2605/* 2694/*
2606 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2695 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2607 */ 2696 */
2608static inline 2697static inline
2609struct page *buffered_rmqueue(struct zone *preferred_zone, 2698struct page *rmqueue(struct zone *preferred_zone,
2610 struct zone *zone, unsigned int order, 2699 struct zone *zone, unsigned int order,
2611 gfp_t gfp_flags, unsigned int alloc_flags, 2700 gfp_t gfp_flags, unsigned int alloc_flags,
2612 int migratetype) 2701 int migratetype)
2613{ 2702{
2614 unsigned long flags; 2703 unsigned long flags;
2615 struct page *page; 2704 struct page *page;
2616 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2617 2705
2618 if (likely(order == 0)) { 2706 if (likely(order == 0) && !in_interrupt()) {
2619 struct per_cpu_pages *pcp; 2707 page = rmqueue_pcplist(preferred_zone, zone, order,
2620 struct list_head *list; 2708 gfp_flags, migratetype);
2621 2709 goto out;
2622 local_irq_save(flags); 2710 }
2623 do {
2624 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2625 list = &pcp->lists[migratetype];
2626 if (list_empty(list)) {
2627 pcp->count += rmqueue_bulk(zone, 0,
2628 pcp->batch, list,
2629 migratetype, cold);
2630 if (unlikely(list_empty(list)))
2631 goto failed;
2632 }
2633
2634 if (cold)
2635 page = list_last_entry(list, struct page, lru);
2636 else
2637 page = list_first_entry(list, struct page, lru);
2638
2639 list_del(&page->lru);
2640 pcp->count--;
2641 2711
2642 } while (check_new_pcp(page)); 2712 /*
2643 } else { 2713 * We most definitely don't want callers attempting to
2644 /* 2714 * allocate greater than order-1 page units with __GFP_NOFAIL.
2645 * We most definitely don't want callers attempting to 2715 */
2646 * allocate greater than order-1 page units with __GFP_NOFAIL. 2716 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2647 */ 2717 spin_lock_irqsave(&zone->lock, flags);
2648 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2649 spin_lock_irqsave(&zone->lock, flags);
2650 2718
2651 do { 2719 do {
2652 page = NULL; 2720 page = NULL;
2653 if (alloc_flags & ALLOC_HARDER) { 2721 if (alloc_flags & ALLOC_HARDER) {
2654 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2722 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2655 if (page) 2723 if (page)
2656 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2724 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2657 } 2725 }
2658 if (!page)
2659 page = __rmqueue(zone, order, migratetype);
2660 } while (page && check_new_pages(page, order));
2661 spin_unlock(&zone->lock);
2662 if (!page) 2726 if (!page)
2663 goto failed; 2727 page = __rmqueue(zone, order, migratetype);
2664 __mod_zone_freepage_state(zone, -(1 << order), 2728 } while (page && check_new_pages(page, order));
2665 get_pcppage_migratetype(page)); 2729 spin_unlock(&zone->lock);
2666 } 2730 if (!page)
2731 goto failed;
2732 __mod_zone_freepage_state(zone, -(1 << order),
2733 get_pcppage_migratetype(page));
2667 2734
2668 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2735 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2669 zone_statistics(preferred_zone, zone); 2736 zone_statistics(preferred_zone, zone);
2670 local_irq_restore(flags); 2737 local_irq_restore(flags);
2671 2738
2672 VM_BUG_ON_PAGE(bad_range(zone, page), page); 2739out:
2740 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2673 return page; 2741 return page;
2674 2742
2675failed: 2743failed:
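
The new __rmqueue_pcplist() above keeps the long-standing hot/cold convention of the per-cpu lists: cold requests take list_last_entry() from the tail, everything else takes list_first_entry() from the head, where recently freed, cache-warm pages were pushed. A small standalone C sketch of that list discipline follows; demo_page and the list helpers are invented for illustration and are not kernel code.

/* Illustration only: head = hot (LIFO, cache-warm), tail = cold. */
#include <stdbool.h>
#include <stdio.h>

struct demo_page {
	int id;
	struct demo_page *prev, *next;
};

/* circular doubly linked list with a dummy head, kernel-list style */
static struct demo_page list = { .prev = &list, .next = &list };

static void list_add_head(struct demo_page *p)
{
	p->next = list.next;
	p->prev = &list;
	list.next->prev = p;
	list.next = p;
}

static void list_add_tail(struct demo_page *p)
{
	p->prev = list.prev;
	p->next = &list;
	list.prev->next = p;
	list.prev = p;
}

static struct demo_page *list_take(bool cold)
{
	struct demo_page *p = cold ? list.prev : list.next;

	if (p == &list)
		return NULL;	/* empty: rmqueue_bulk() would refill here */
	p->prev->next = p->next;
	p->next->prev = p->prev;
	return p;
}

int main(void)
{
	struct demo_page pages[4] = { {1}, {2}, {3}, {4} };

	list_add_head(&pages[0]);	/* freed hot */
	list_add_head(&pages[1]);	/* freed hot */
	list_add_tail(&pages[2]);	/* freed cold */
	list_add_tail(&pages[3]);	/* freed cold */

	printf("hot alloc  -> page %d\n", list_take(false)->id);	/* 2, most recently freed */
	printf("cold alloc -> page %d\n", list_take(true)->id);	/* 4, least cache-warm */
	return 0;
}
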
@@ -2877,7 +2945,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
2877#ifdef CONFIG_NUMA 2945#ifdef CONFIG_NUMA
2878static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2946static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2879{ 2947{
2880 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < 2948 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
2881 RECLAIM_DISTANCE; 2949 RECLAIM_DISTANCE;
2882} 2950}
2883#else /* CONFIG_NUMA */ 2951#else /* CONFIG_NUMA */
@@ -2974,7 +3042,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2974 } 3042 }
2975 3043
2976try_this_zone: 3044try_this_zone:
2977 page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, 3045 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
2978 gfp_mask, alloc_flags, ac->migratetype); 3046 gfp_mask, alloc_flags, ac->migratetype);
2979 if (page) { 3047 if (page) {
2980 prep_new_page(page, order, gfp_mask, alloc_flags); 3048 prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3007,18 +3075,12 @@ static inline bool should_suppress_show_mem(void)
3007 return ret; 3075 return ret;
3008} 3076}
3009 3077
3010static DEFINE_RATELIMIT_STATE(nopage_rs, 3078static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3011 DEFAULT_RATELIMIT_INTERVAL,
3012 DEFAULT_RATELIMIT_BURST);
3013
3014void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3015{ 3079{
3016 unsigned int filter = SHOW_MEM_FILTER_NODES; 3080 unsigned int filter = SHOW_MEM_FILTER_NODES;
3017 struct va_format vaf; 3081 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3018 va_list args;
3019 3082
3020 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 3083 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3021 debug_guardpage_minorder() > 0)
3022 return; 3084 return;
3023 3085
3024 /* 3086 /*
@@ -3033,6 +3095,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3033 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3095 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3034 filter &= ~SHOW_MEM_FILTER_NODES; 3096 filter &= ~SHOW_MEM_FILTER_NODES;
3035 3097
3098 show_mem(filter, nodemask);
3099}
3100
3101void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3102{
3103 struct va_format vaf;
3104 va_list args;
3105 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3106 DEFAULT_RATELIMIT_BURST);
3107
3108 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
3109 debug_guardpage_minorder() > 0)
3110 return;
3111
3036 pr_warn("%s: ", current->comm); 3112 pr_warn("%s: ", current->comm);
3037 3113
3038 va_start(args, fmt); 3114 va_start(args, fmt);
@@ -3041,11 +3117,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3041 pr_cont("%pV", &vaf); 3117 pr_cont("%pV", &vaf);
3042 va_end(args); 3118 va_end(args);
3043 3119
3044 pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); 3120 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3121 if (nodemask)
3122 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3123 else
3124 pr_cont("(null)\n");
3125
3126 cpuset_print_current_mems_allowed();
3045 3127
3046 dump_stack(); 3128 dump_stack();
3047 if (!should_suppress_show_mem()) 3129 warn_alloc_show_mem(gfp_mask, nodemask);
3048 show_mem(filter); 3130}
3131
3132static inline struct page *
3133__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3134 unsigned int alloc_flags,
3135 const struct alloc_context *ac)
3136{
3137 struct page *page;
3138
3139 page = get_page_from_freelist(gfp_mask, order,
3140 alloc_flags|ALLOC_CPUSET, ac);
3141 /*
3142 * fallback to ignore cpuset restriction if our nodes
3143 * are depleted
3144 */
3145 if (!page)
3146 page = get_page_from_freelist(gfp_mask, order,
3147 alloc_flags, ac);
3148
3149 return page;
3049} 3150}
3050 3151
3051static inline struct page * 3152static inline struct page *
@@ -3083,47 +3184,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3083 if (page) 3184 if (page)
3084 goto out; 3185 goto out;
3085 3186
3086 if (!(gfp_mask & __GFP_NOFAIL)) { 3187 /* Coredumps can quickly deplete all memory reserves */
3087 /* Coredumps can quickly deplete all memory reserves */ 3188 if (current->flags & PF_DUMPCORE)
3088 if (current->flags & PF_DUMPCORE) 3189 goto out;
3089 goto out; 3190 /* The OOM killer will not help higher order allocs */
3090 /* The OOM killer will not help higher order allocs */ 3191 if (order > PAGE_ALLOC_COSTLY_ORDER)
3091 if (order > PAGE_ALLOC_COSTLY_ORDER) 3192 goto out;
3092 goto out; 3193 /* The OOM killer does not needlessly kill tasks for lowmem */
3093 /* The OOM killer does not needlessly kill tasks for lowmem */ 3194 if (ac->high_zoneidx < ZONE_NORMAL)
3094 if (ac->high_zoneidx < ZONE_NORMAL) 3195 goto out;
3095 goto out; 3196 if (pm_suspended_storage())
3096 if (pm_suspended_storage()) 3197 goto out;
3097 goto out; 3198 /*
3098 /* 3199 * XXX: GFP_NOFS allocations should rather fail than rely on
3099 * XXX: GFP_NOFS allocations should rather fail than rely on 3200 * other request to make a forward progress.
3100 * other request to make a forward progress. 3201 * We are in an unfortunate situation where out_of_memory cannot
3101 * We are in an unfortunate situation where out_of_memory cannot 3202 * do much for this context but let's try it to at least get
3102 * do much for this context but let's try it to at least get 3203 * access to memory reserved if the current task is killed (see
3103 * access to memory reserved if the current task is killed (see 3204 * out_of_memory). Once filesystems are ready to handle allocation
3104 * out_of_memory). Once filesystems are ready to handle allocation 3205 * failures more gracefully we should just bail out here.
3105 * failures more gracefully we should just bail out here. 3206 */
3106 */ 3207
3208 /* The OOM killer may not free memory on a specific node */
3209 if (gfp_mask & __GFP_THISNODE)
3210 goto out;
3107 3211
3108 /* The OOM killer may not free memory on a specific node */
3109 if (gfp_mask & __GFP_THISNODE)
3110 goto out;
3111 }
3112 /* Exhausted what can be done so it's blamo time */ 3212 /* Exhausted what can be done so it's blamo time */
3113 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3213 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3114 *did_some_progress = 1; 3214 *did_some_progress = 1;
3115 3215
3116 if (gfp_mask & __GFP_NOFAIL) { 3216 /*
3117 page = get_page_from_freelist(gfp_mask, order, 3217 * Help non-failing allocations by giving them access to memory
3118 ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); 3218 * reserves
3119 /* 3219 */
3120 * fallback to ignore cpuset restriction if our nodes 3220 if (gfp_mask & __GFP_NOFAIL)
3121 * are depleted 3221 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3122 */
3123 if (!page)
3124 page = get_page_from_freelist(gfp_mask, order,
3125 ALLOC_NO_WATERMARKS, ac); 3222 ALLOC_NO_WATERMARKS, ac);
3126 }
3127 } 3223 }
3128out: 3224out:
3129 mutex_unlock(&oom_lock); 3225 mutex_unlock(&oom_lock);
@@ -3192,6 +3288,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3192{ 3288{
3193 int max_retries = MAX_COMPACT_RETRIES; 3289 int max_retries = MAX_COMPACT_RETRIES;
3194 int min_priority; 3290 int min_priority;
3291 bool ret = false;
3292 int retries = *compaction_retries;
3293 enum compact_priority priority = *compact_priority;
3195 3294
3196 if (!order) 3295 if (!order)
3197 return false; 3296 return false;
@@ -3213,8 +3312,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3213 * But do not retry if the given zonelist is not suitable for 3312 * But do not retry if the given zonelist is not suitable for
3214 * compaction. 3313 * compaction.
3215 */ 3314 */
3216 if (compaction_withdrawn(compact_result)) 3315 if (compaction_withdrawn(compact_result)) {
3217 return compaction_zonelist_suitable(ac, order, alloc_flags); 3316 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3317 goto out;
3318 }
3218 3319
3219 /* 3320 /*
3220 * !costly requests are much more important than __GFP_REPEAT 3321 * !costly requests are much more important than __GFP_REPEAT
@@ -3226,8 +3327,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3226 */ 3327 */
3227 if (order > PAGE_ALLOC_COSTLY_ORDER) 3328 if (order > PAGE_ALLOC_COSTLY_ORDER)
3228 max_retries /= 4; 3329 max_retries /= 4;
3229 if (*compaction_retries <= max_retries) 3330 if (*compaction_retries <= max_retries) {
3230 return true; 3331 ret = true;
3332 goto out;
3333 }
3231 3334
3232 /* 3335 /*
3233 * Make sure there are attempts at the highest priority if we exhausted 3336 * Make sure there are attempts at the highest priority if we exhausted
@@ -3236,12 +3339,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3236check_priority: 3339check_priority:
3237 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3340 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3238 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3341 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3342
3239 if (*compact_priority > min_priority) { 3343 if (*compact_priority > min_priority) {
3240 (*compact_priority)--; 3344 (*compact_priority)--;
3241 *compaction_retries = 0; 3345 *compaction_retries = 0;
3242 return true; 3346 ret = true;
3243 } 3347 }
3244 return false; 3348out:
3349 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3350 return ret;
3245} 3351}
3246#else 3352#else
3247static inline struct page * 3353static inline struct page *
@@ -3464,6 +3570,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3464 ac->nodemask) { 3570 ac->nodemask) {
3465 unsigned long available; 3571 unsigned long available;
3466 unsigned long reclaimable; 3572 unsigned long reclaimable;
3573 unsigned long min_wmark = min_wmark_pages(zone);
3574 bool wmark;
3467 3575
3468 available = reclaimable = zone_reclaimable_pages(zone); 3576 available = reclaimable = zone_reclaimable_pages(zone);
3469 available -= DIV_ROUND_UP((*no_progress_loops) * available, 3577 available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3474,8 +3582,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3474 * Would the allocation succeed if we reclaimed the whole 3582 * Would the allocation succeed if we reclaimed the whole
3475 * available? 3583 * available?
3476 */ 3584 */
3477 if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), 3585 wmark = __zone_watermark_ok(zone, order, min_wmark,
3478 ac_classzone_idx(ac), alloc_flags, available)) { 3586 ac_classzone_idx(ac), alloc_flags, available);
3587 trace_reclaim_retry_zone(z, order, reclaimable,
3588 available, min_wmark, *no_progress_loops, wmark);
3589 if (wmark) {
3479 /* 3590 /*
3480 * If we didn't make any progress and have a lot of 3591 * If we didn't make any progress and have a lot of
3481 * dirty + writeback pages then we should wait for 3592 * dirty + writeback pages then we should wait for
@@ -3555,6 +3666,14 @@ retry_cpuset:
3555 no_progress_loops = 0; 3666 no_progress_loops = 0;
3556 compact_priority = DEF_COMPACT_PRIORITY; 3667 compact_priority = DEF_COMPACT_PRIORITY;
3557 cpuset_mems_cookie = read_mems_allowed_begin(); 3668 cpuset_mems_cookie = read_mems_allowed_begin();
3669
3670 /*
3671 * The fast path uses conservative alloc_flags to succeed only until
3672 * kswapd needs to be woken up, and to avoid the cost of setting up
3673 * alloc_flags precisely. So we do that now.
3674 */
3675 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3676
3558 /* 3677 /*
3559 * We need to recalculate the starting point for the zonelist iterator 3678 * We need to recalculate the starting point for the zonelist iterator
3560 * because we might have used different nodemask in the fast path, or 3679 * because we might have used different nodemask in the fast path, or
@@ -3566,14 +3685,6 @@ retry_cpuset:
3566 if (!ac->preferred_zoneref->zone) 3685 if (!ac->preferred_zoneref->zone)
3567 goto nopage; 3686 goto nopage;
3568 3687
3569
3570 /*
3571 * The fast path uses conservative alloc_flags to succeed only until
3572 * kswapd needs to be woken up, and to avoid the cost of setting up
3573 * alloc_flags precisely. So we do that now.
3574 */
3575 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3576
3577 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3688 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3578 wake_all_kswapds(order, ac); 3689 wake_all_kswapds(order, ac);
3579 3690
@@ -3650,35 +3761,21 @@ retry:
3650 goto got_pg; 3761 goto got_pg;
3651 3762
3652 /* Caller is not willing to reclaim, we can't balance anything */ 3763 /* Caller is not willing to reclaim, we can't balance anything */
3653 if (!can_direct_reclaim) { 3764 if (!can_direct_reclaim)
3654 /*
3655 * All existing users of the __GFP_NOFAIL are blockable, so warn
3656 * of any new users that actually allow this type of allocation
3657 * to fail.
3658 */
3659 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
3660 goto nopage; 3765 goto nopage;
3661 }
3662 3766
3663 /* Avoid recursion of direct reclaim */ 3767 /* Make sure we know about allocations which stall for too long */
3664 if (current->flags & PF_MEMALLOC) { 3768 if (time_after(jiffies, alloc_start + stall_timeout)) {
3665 /* 3769 warn_alloc(gfp_mask, ac->nodemask,
3666 * __GFP_NOFAIL request from this context is rather bizarre 3770 "page allocation stalls for %ums, order:%u",
3667 * because we cannot reclaim anything and only can loop waiting 3771 jiffies_to_msecs(jiffies-alloc_start), order);
3668 * for somebody to do a work for us. 3772 stall_timeout += 10 * HZ;
3669 */
3670 if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3671 cond_resched();
3672 goto retry;
3673 }
3674 goto nopage;
3675 } 3773 }
3676 3774
3677 /* Avoid allocations with no watermarks from looping endlessly */ 3775 /* Avoid recursion of direct reclaim */
3678 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 3776 if (current->flags & PF_MEMALLOC)
3679 goto nopage; 3777 goto nopage;
3680 3778
3681
3682 /* Try direct reclaim and then allocating */ 3779 /* Try direct reclaim and then allocating */
3683 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 3780 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
3684 &did_some_progress); 3781 &did_some_progress);
@@ -3702,14 +3799,6 @@ retry:
3702 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) 3799 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
3703 goto nopage; 3800 goto nopage;
3704 3801
3705 /* Make sure we know about allocations which stall for too long */
3706 if (time_after(jiffies, alloc_start + stall_timeout)) {
3707 warn_alloc(gfp_mask,
3708 "page allocation stalls for %ums, order:%u",
3709 jiffies_to_msecs(jiffies-alloc_start), order);
3710 stall_timeout += 10 * HZ;
3711 }
3712
3713 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 3802 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
3714 did_some_progress > 0, &no_progress_loops)) 3803 did_some_progress > 0, &no_progress_loops))
3715 goto retry; 3804 goto retry;
@@ -3738,6 +3827,10 @@ retry:
3738 if (page) 3827 if (page)
3739 goto got_pg; 3828 goto got_pg;
3740 3829
3830 /* Avoid allocations with no watermarks from looping endlessly */
3831 if (test_thread_flag(TIF_MEMDIE))
3832 goto nopage;
3833
3741 /* Retry as long as the OOM killer is making progress */ 3834 /* Retry as long as the OOM killer is making progress */
3742 if (did_some_progress) { 3835 if (did_some_progress) {
3743 no_progress_loops = 0; 3836 no_progress_loops = 0;
@@ -3755,82 +3848,123 @@ nopage:
3755 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3848 if (read_mems_allowed_retry(cpuset_mems_cookie))
3756 goto retry_cpuset; 3849 goto retry_cpuset;
3757 3850
3758 warn_alloc(gfp_mask, 3851 /*
3852 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
3853 * we always retry
3854 */
3855 if (gfp_mask & __GFP_NOFAIL) {
3856 /*
3857 * All existing users of the __GFP_NOFAIL are blockable, so warn
3858 * of any new users that actually require GFP_NOWAIT
3859 */
3860 if (WARN_ON_ONCE(!can_direct_reclaim))
3861 goto fail;
3862
3863 /*
3864 * PF_MEMALLOC request from this context is rather bizarre
3865 * because we cannot reclaim anything and only can loop waiting
3866 * for somebody to do a work for us
3867 */
3868 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
3869
3870 /*
3871 * non failing costly orders are a hard requirement which we
3872 * are not prepared for much so let's warn about these users
3873 * so that we can identify them and convert them to something
3874 * else.
3875 */
3876 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
3877
3878 /*
3879 * Help non-failing allocations by giving them access to memory
3880 * reserves but do not use ALLOC_NO_WATERMARKS because this
3881 * could deplete whole memory reserves which would just make
3882 * the situation worse
3883 */
3884 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
3885 if (page)
3886 goto got_pg;
3887
3888 cond_resched();
3889 goto retry;
3890 }
3891fail:
3892 warn_alloc(gfp_mask, ac->nodemask,
3759 "page allocation failure: order:%u", order); 3893 "page allocation failure: order:%u", order);
3760got_pg: 3894got_pg:
3761 return page; 3895 return page;
3762} 3896}
3763 3897
3764/* 3898static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
3765 * This is the 'heart' of the zoned buddy allocator. 3899 struct zonelist *zonelist, nodemask_t *nodemask,
3766 */ 3900 struct alloc_context *ac, gfp_t *alloc_mask,
3767struct page * 3901 unsigned int *alloc_flags)
3768__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3769 struct zonelist *zonelist, nodemask_t *nodemask)
3770{ 3902{
3771 struct page *page; 3903 ac->high_zoneidx = gfp_zone(gfp_mask);
3772 unsigned int alloc_flags = ALLOC_WMARK_LOW; 3904 ac->zonelist = zonelist;
3773 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ 3905 ac->nodemask = nodemask;
3774 struct alloc_context ac = { 3906 ac->migratetype = gfpflags_to_migratetype(gfp_mask);
3775 .high_zoneidx = gfp_zone(gfp_mask),
3776 .zonelist = zonelist,
3777 .nodemask = nodemask,
3778 .migratetype = gfpflags_to_migratetype(gfp_mask),
3779 };
3780 3907
3781 if (cpusets_enabled()) { 3908 if (cpusets_enabled()) {
3782 alloc_mask |= __GFP_HARDWALL; 3909 *alloc_mask |= __GFP_HARDWALL;
3783 alloc_flags |= ALLOC_CPUSET; 3910 if (!ac->nodemask)
3784 if (!ac.nodemask) 3911 ac->nodemask = &cpuset_current_mems_allowed;
3785 ac.nodemask = &cpuset_current_mems_allowed; 3912 else
3913 *alloc_flags |= ALLOC_CPUSET;
3786 } 3914 }
3787 3915
3788 gfp_mask &= gfp_allowed_mask;
3789
3790 lockdep_trace_alloc(gfp_mask); 3916 lockdep_trace_alloc(gfp_mask);
3791 3917
3792 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 3918 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
3793 3919
3794 if (should_fail_alloc_page(gfp_mask, order)) 3920 if (should_fail_alloc_page(gfp_mask, order))
3795 return NULL; 3921 return false;
3796 3922
3797 /* 3923 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
3798 * Check the zones suitable for the gfp_mask contain at least one 3924 *alloc_flags |= ALLOC_CMA;
3799 * valid zone. It's possible to have an empty zonelist as a result
3800 * of __GFP_THISNODE and a memoryless node
3801 */
3802 if (unlikely(!zonelist->_zonerefs->zone))
3803 return NULL;
3804 3925
3805 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) 3926 return true;
3806 alloc_flags |= ALLOC_CMA; 3927}
3807 3928
3929/* Determine whether to spread dirty pages and what the first usable zone */
3930static inline void finalise_ac(gfp_t gfp_mask,
3931 unsigned int order, struct alloc_context *ac)
3932{
3808 /* Dirty zone balancing only done in the fast path */ 3933 /* Dirty zone balancing only done in the fast path */
3809 ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); 3934 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
3810 3935
3811 /* 3936 /*
3812 * The preferred zone is used for statistics but crucially it is 3937 * The preferred zone is used for statistics but crucially it is
3813 * also used as the starting point for the zonelist iterator. It 3938 * also used as the starting point for the zonelist iterator. It
3814 * may get reset for allocations that ignore memory policies. 3939 * may get reset for allocations that ignore memory policies.
3815 */ 3940 */
3816 ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, 3941 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3817 ac.high_zoneidx, ac.nodemask); 3942 ac->high_zoneidx, ac->nodemask);
3818 if (!ac.preferred_zoneref->zone) { 3943}
3819 page = NULL; 3944
3820 /* 3945/*
3821 * This might be due to race with cpuset_current_mems_allowed 3946 * This is the 'heart' of the zoned buddy allocator.
3822 * update, so make sure we retry with original nodemask in the 3947 */
3823 * slow path. 3948struct page *
3824 */ 3949__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3825 goto no_zone; 3950 struct zonelist *zonelist, nodemask_t *nodemask)
3826 } 3951{
3952 struct page *page;
3953 unsigned int alloc_flags = ALLOC_WMARK_LOW;
3954 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
3955 struct alloc_context ac = { };
3956
3957 gfp_mask &= gfp_allowed_mask;
3958 if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
3959 return NULL;
3960
3961 finalise_ac(gfp_mask, order, &ac);
3827 3962
3828 /* First allocation attempt */ 3963 /* First allocation attempt */
3829 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 3964 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
3830 if (likely(page)) 3965 if (likely(page))
3831 goto out; 3966 goto out;
3832 3967
3833no_zone:
3834 /* 3968 /*
3835 * Runtime PM, block IO and its error handling path can deadlock 3969 * Runtime PM, block IO and its error handling path can deadlock
3836 * because I/O on the device might not complete. 3970 * because I/O on the device might not complete.
@@ -4252,20 +4386,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4252 * Determine whether the node should be displayed or not, depending on whether 4386 * Determine whether the node should be displayed or not, depending on whether
4253 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 4387 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4254 */ 4388 */
4255bool skip_free_areas_node(unsigned int flags, int nid) 4389static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
4256{ 4390{
4257 bool ret = false;
4258 unsigned int cpuset_mems_cookie;
4259
4260 if (!(flags & SHOW_MEM_FILTER_NODES)) 4391 if (!(flags & SHOW_MEM_FILTER_NODES))
4261 goto out; 4392 return false;
4262 4393
4263 do { 4394 /*
4264 cpuset_mems_cookie = read_mems_allowed_begin(); 4395 * no node mask - aka implicit memory numa policy. Do not bother with
4265 ret = !node_isset(nid, cpuset_current_mems_allowed); 4396 * the synchronization - read_mems_allowed_begin - because we do not
4266 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 4397 * have to be precise here.
4267out: 4398 */
4268 return ret; 4399 if (!nodemask)
4400 nodemask = &cpuset_current_mems_allowed;
4401
4402 return !node_isset(nid, *nodemask);
4269} 4403}
4270 4404
4271#define K(x) ((x) << (PAGE_SHIFT-10)) 4405#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4306,7 +4440,7 @@ static void show_migration_types(unsigned char type)
4306 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 4440 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4307 * cpuset. 4441 * cpuset.
4308 */ 4442 */
4309void show_free_areas(unsigned int filter) 4443void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4310{ 4444{
4311 unsigned long free_pcp = 0; 4445 unsigned long free_pcp = 0;
4312 int cpu; 4446 int cpu;
@@ -4314,7 +4448,7 @@ void show_free_areas(unsigned int filter)
4314 pg_data_t *pgdat; 4448 pg_data_t *pgdat;
4315 4449
4316 for_each_populated_zone(zone) { 4450 for_each_populated_zone(zone) {
4317 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4451 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4318 continue; 4452 continue;
4319 4453
4320 for_each_online_cpu(cpu) 4454 for_each_online_cpu(cpu)
@@ -4348,6 +4482,9 @@ void show_free_areas(unsigned int filter)
4348 global_page_state(NR_FREE_CMA_PAGES)); 4482 global_page_state(NR_FREE_CMA_PAGES));
4349 4483
4350 for_each_online_pgdat(pgdat) { 4484 for_each_online_pgdat(pgdat) {
4485 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
4486 continue;
4487
4351 printk("Node %d" 4488 printk("Node %d"
4352 " active_anon:%lukB" 4489 " active_anon:%lukB"
4353 " inactive_anon:%lukB" 4490 " inactive_anon:%lukB"
@@ -4397,7 +4534,7 @@ void show_free_areas(unsigned int filter)
4397 for_each_populated_zone(zone) { 4534 for_each_populated_zone(zone) {
4398 int i; 4535 int i;
4399 4536
4400 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4537 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4401 continue; 4538 continue;
4402 4539
4403 free_pcp = 0; 4540 free_pcp = 0;
@@ -4462,7 +4599,7 @@ void show_free_areas(unsigned int filter)
4462 unsigned long nr[MAX_ORDER], flags, total = 0; 4599 unsigned long nr[MAX_ORDER], flags, total = 0;
4463 unsigned char types[MAX_ORDER]; 4600 unsigned char types[MAX_ORDER];
4464 4601
4465 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4602 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4466 continue; 4603 continue;
4467 show_node(zone); 4604 show_node(zone);
4468 printk(KERN_CONT "%s: ", zone->name); 4605 printk(KERN_CONT "%s: ", zone->name);
@@ -5083,8 +5220,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5083 if (context != MEMMAP_EARLY) 5220 if (context != MEMMAP_EARLY)
5084 goto not_early; 5221 goto not_early;
5085 5222
5086 if (!early_pfn_valid(pfn)) 5223 if (!early_pfn_valid(pfn)) {
5224#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5225 /*
5226 * Skip to the pfn preceding the next valid one (or
5227 * end_pfn), such that we hit a valid pfn (or end_pfn)
5228 * on our next iteration of the loop.
5229 */
5230 pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
5231#endif
5087 continue; 5232 continue;
5233 }
5088 if (!early_pfn_in_nid(pfn, nid)) 5234 if (!early_pfn_in_nid(pfn, nid))
5089 continue; 5235 continue;
5090 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) 5236 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
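
The hunk above lets memmap_init_zone() leap over holes: when a pfn is invalid it jumps to one before the next valid pfn (as reported by memblock_next_valid_pfn()), so the enclosing for-loop's pfn++ lands exactly on the next valid frame or on end_pfn. A toy userspace sketch of that skip-ahead idiom follows, with an invented next_valid_pfn_demo() in place of the memblock helper.

/* Sketch of the skip-ahead idiom above; nothing here is kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define END_PFN 24UL

/* pretend pfns 8..15 sit in a hole in the memory map */
static bool pfn_valid_demo(unsigned long pfn)
{
	return pfn < 8 || pfn > 15;
}

static unsigned long next_valid_pfn_demo(unsigned long pfn, unsigned long end)
{
	while (pfn < end && !pfn_valid_demo(pfn))
		pfn++;
	return pfn;	/* first valid pfn at or after 'pfn', capped at end */
}

int main(void)
{
	unsigned long pfn;
	unsigned int iterations = 0;

	for (pfn = 0; pfn < END_PFN; pfn++) {
		iterations++;
		if (!pfn_valid_demo(pfn)) {
			/* land one short so the loop increment hits the valid pfn */
			pfn = next_valid_pfn_demo(pfn, END_PFN) - 1;
			continue;
		}
		printf("init pfn %lu\n", pfn);
	}
	printf("loop iterations: %u (instead of %lu)\n", iterations, END_PFN);
	return 0;
}
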
@@ -5780,7 +5926,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5780 * the zone and SPARSEMEM is in use. If there are holes within the 5926 * the zone and SPARSEMEM is in use. If there are holes within the
5781 * zone, each populated memory region may cost us one or two extra 5927 * zone, each populated memory region may cost us one or two extra
5782 * memmap pages due to alignment because memmap pages for each 5928 * memmap pages due to alignment because memmap pages for each
5783 * populated regions may not naturally algined on page boundary. 5929 * populated regions may not be naturally aligned on page boundary.
5784 * So the (present_pages >> 4) heuristic is a tradeoff for that. 5930 * So the (present_pages >> 4) heuristic is a tradeoff for that.
5785 */ 5931 */
5786 if (spanned_pages > present_pages + (present_pages >> 4) && 5932 if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6344,8 +6490,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6344 6490
6345 start_pfn = end_pfn; 6491 start_pfn = end_pfn;
6346 } 6492 }
6347 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
6348 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
6349 6493
6350 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 6494 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6351 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 6495 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -7081,8 +7225,9 @@ void *__init alloc_large_system_hash(const char *tablename,
7081 * If @count is not zero, it is okay to include less @count unmovable pages 7225 * If @count is not zero, it is okay to include less @count unmovable pages
7082 * 7226 *
7083 * PageLRU check without isolation or lru_lock could race so that 7227 * PageLRU check without isolation or lru_lock could race so that
7084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 7228 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
7085 * expect this function should be exact. 7229 * check without lock_page also may miss some movable non-lru pages at
7230 * race condition. So you can't expect this function should be exact.
7086 */ 7231 */
7087bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7232bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7088 bool skip_hwpoisoned_pages) 7233 bool skip_hwpoisoned_pages)
@@ -7138,6 +7283,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7138 if (skip_hwpoisoned_pages && PageHWPoison(page)) 7283 if (skip_hwpoisoned_pages && PageHWPoison(page))
7139 continue; 7284 continue;
7140 7285
7286 if (__PageMovable(page))
7287 continue;
7288
7141 if (!PageLRU(page)) 7289 if (!PageLRU(page))
7142 found++; 7290 found++;
7143 /* 7291 /*
@@ -7249,6 +7397,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7249 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 7397 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
7250 * in range must have the same migratetype and it must 7398 * in range must have the same migratetype and it must
7251 * be either of the two. 7399 * be either of the two.
7400 * @gfp_mask: GFP mask to use during compaction
7252 * 7401 *
7253 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 7402 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7254 * aligned, however it's the caller's responsibility to guarantee that 7403 * aligned, however it's the caller's responsibility to guarantee that
@@ -7262,7 +7411,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7262 * need to be freed with free_contig_range(). 7411 * need to be freed with free_contig_range().
7263 */ 7412 */
7264int alloc_contig_range(unsigned long start, unsigned long end, 7413int alloc_contig_range(unsigned long start, unsigned long end,
7265 unsigned migratetype) 7414 unsigned migratetype, gfp_t gfp_mask)
7266{ 7415{
7267 unsigned long outer_start, outer_end; 7416 unsigned long outer_start, outer_end;
7268 unsigned int order; 7417 unsigned int order;
@@ -7274,7 +7423,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7274 .zone = page_zone(pfn_to_page(start)), 7423 .zone = page_zone(pfn_to_page(start)),
7275 .mode = MIGRATE_SYNC, 7424 .mode = MIGRATE_SYNC,
7276 .ignore_skip_hint = true, 7425 .ignore_skip_hint = true,
7277 .gfp_mask = GFP_KERNEL, 7426 .gfp_mask = memalloc_noio_flags(gfp_mask),
7278 }; 7427 };
7279 INIT_LIST_HEAD(&cc.migratepages); 7428 INIT_LIST_HEAD(&cc.migratepages);
7280 7429