Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  324
1 file changed, 198 insertions(+), 126 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c21b33668133..a7a6aac95a6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -59,7 +59,6 @@
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
-#include <linux/page_ext.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
@@ -92,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
 EXPORT_SYMBOL(latent_entropy);
@@ -1085,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
         int migratetype = 0;
         int batch_free = 0;
-        unsigned long nr_scanned;
+        unsigned long nr_scanned, flags;
         bool isolated_pageblocks;
 
-        spin_lock(&zone->lock);
+        spin_lock_irqsave(&zone->lock, flags);
         isolated_pageblocks = has_isolate_pageblock(zone);
         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
         if (nr_scanned)
@@ -1137,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                         trace_mm_page_pcpu_drain(page, 0, mt);
                 } while (--count && --batch_free && !list_empty(list));
         }
-        spin_unlock(&zone->lock);
+        spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1145,8 +1148,9 @@ static void free_one_page(struct zone *zone,
                                 unsigned int order,
                                 int migratetype)
 {
-        unsigned long nr_scanned;
-        spin_lock(&zone->lock);
+        unsigned long nr_scanned, flags;
+        spin_lock_irqsave(&zone->lock, flags);
+        __count_vm_events(PGFREE, 1 << order);
         nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
         if (nr_scanned)
                 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1156,7 +1160,7 @@ static void free_one_page(struct zone *zone,
                 migratetype = get_pfnblock_migratetype(page, pfn);
         }
         __free_one_page(page, pfn, zone, order, migratetype);
-        spin_unlock(&zone->lock);
+        spin_unlock_irqrestore(&zone->lock, flags);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1234,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
-        unsigned long flags;
         int migratetype;
         unsigned long pfn = page_to_pfn(page);
 
@@ -1242,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
                 return;
 
         migratetype = get_pfnblock_migratetype(page, pfn);
-        local_irq_save(flags);
-        __count_vm_events(PGFREE, 1 << order);
         free_one_page(page_zone(page), page, pfn, order, migratetype);
-        local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2217,8 +2217,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                         int migratetype, bool cold)
 {
         int i, alloced = 0;
+        unsigned long flags;
 
-        spin_lock(&zone->lock);
+        spin_lock_irqsave(&zone->lock, flags);
         for (i = 0; i < count; ++i) {
                 struct page *page = __rmqueue(zone, order, migratetype);
                 if (unlikely(page == NULL))
@@ -2254,7 +2255,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
          * pages added to the pcp list.
          */
         __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-        spin_unlock(&zone->lock);
+        spin_unlock_irqrestore(&zone->lock, flags);
         return alloced;
 }
 
@@ -2339,16 +2340,26 @@ void drain_local_pages(struct zone *zone)
                 drain_pages(cpu);
 }
 
+static void drain_local_pages_wq(struct work_struct *work)
+{
+        /*
+         * drain_all_pages doesn't use proper cpu hotplug protection so
+         * we can race with cpu offline when the WQ can move this from
+         * a cpu pinned worker to an unbound one. We can operate on a different
+         * cpu which is allright but we also have to make sure to not move to
+         * a different one.
+         */
+        preempt_disable();
+        drain_local_pages(NULL);
+        preempt_enable();
+}
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
  *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
@@ -2360,6 +2371,21 @@ void drain_all_pages(struct zone *zone)
          */
         static cpumask_t cpus_with_pcps;
 
+        /* Workqueues cannot recurse */
+        if (current->flags & PF_WQ_WORKER)
+                return;
+
+        /*
+         * Do not drain if one is already in progress unless it's specific to
+         * a zone. Such callers are primarily CMA and memory hotplug and need
+         * the drain to be complete when the call returns.
+         */
+        if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+                if (!zone)
+                        return;
+                mutex_lock(&pcpu_drain_mutex);
+        }
+
         /*
          * We don't care about racing with CPU hotplug event
          * as offline notification will cause the notified
@@ -2390,8 +2416,16 @@ void drain_all_pages(struct zone *zone)
                 else
                         cpumask_clear_cpu(cpu, &cpus_with_pcps);
         }
-        on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-                                                                zone, 1);
+
+        for_each_cpu(cpu, &cpus_with_pcps) {
+                struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+                INIT_WORK(work, drain_local_pages_wq);
+                schedule_work_on(cpu, work);
+        }
+        for_each_cpu(cpu, &cpus_with_pcps)
+                flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+        mutex_unlock(&pcpu_drain_mutex);
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -2442,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
-        unsigned long flags;
         unsigned long pfn = page_to_pfn(page);
         int migratetype;
 
+        if (in_interrupt()) {
+                __free_pages_ok(page, 0);
+                return;
+        }
+
         if (!free_pcp_prepare(page))
                 return;
 
         migratetype = get_pfnblock_migratetype(page, pfn);
         set_pcppage_migratetype(page, migratetype);
-        local_irq_save(flags);
-        __count_vm_event(PGFREE);
+        preempt_disable();
 
         /*
          * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2469,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                 migratetype = MIGRATE_MOVABLE;
         }
 
+        __count_vm_event(PGFREE);
         pcp = &this_cpu_ptr(zone->pageset)->pcp;
         if (!cold)
                 list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2482,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold)
         }
 
 out:
-        local_irq_restore(flags);
+        preempt_enable();
 }
 
 /*
@@ -2600,74 +2638,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #endif
 }
 
+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+                        bool cold, struct per_cpu_pages *pcp,
+                        struct list_head *list)
+{
+        struct page *page;
+
+        VM_BUG_ON(in_interrupt());
+
+        do {
+                if (list_empty(list)) {
+                        pcp->count += rmqueue_bulk(zone, 0,
+                                        pcp->batch, list,
+                                        migratetype, cold);
+                        if (unlikely(list_empty(list)))
+                                return NULL;
+                }
+
+                if (cold)
+                        page = list_last_entry(list, struct page, lru);
+                else
+                        page = list_first_entry(list, struct page, lru);
+
+                list_del(&page->lru);
+                pcp->count--;
+        } while (check_new_pcp(page));
+
+        return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+                        struct zone *zone, unsigned int order,
+                        gfp_t gfp_flags, int migratetype)
+{
+        struct per_cpu_pages *pcp;
+        struct list_head *list;
+        bool cold = ((gfp_flags & __GFP_COLD) != 0);
+        struct page *page;
+
+        preempt_disable();
+        pcp = &this_cpu_ptr(zone->pageset)->pcp;
+        list = &pcp->lists[migratetype];
+        page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+        if (page) {
+                __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+                zone_statistics(preferred_zone, zone);
+        }
+        preempt_enable();
+        return page;
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
 static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
                         struct zone *zone, unsigned int order,
                         gfp_t gfp_flags, unsigned int alloc_flags,
                         int migratetype)
 {
         unsigned long flags;
         struct page *page;
-        bool cold = ((gfp_flags & __GFP_COLD) != 0);
 
-        if (likely(order == 0)) {
-                struct per_cpu_pages *pcp;
-                struct list_head *list;
-
-                local_irq_save(flags);
-                do {
-                        pcp = &this_cpu_ptr(zone->pageset)->pcp;
-                        list = &pcp->lists[migratetype];
-                        if (list_empty(list)) {
-                                pcp->count += rmqueue_bulk(zone, 0,
-                                                pcp->batch, list,
-                                                migratetype, cold);
-                                if (unlikely(list_empty(list)))
-                                        goto failed;
-                        }
-
-                        if (cold)
-                                page = list_last_entry(list, struct page, lru);
-                        else
-                                page = list_first_entry(list, struct page, lru);
-
-                        list_del(&page->lru);
-                        pcp->count--;
+        if (likely(order == 0) && !in_interrupt()) {
+                page = rmqueue_pcplist(preferred_zone, zone, order,
+                                gfp_flags, migratetype);
+                goto out;
+        }
 
-                } while (check_new_pcp(page));
-        } else {
-                /*
-                 * We most definitely don't want callers attempting to
-                 * allocate greater than order-1 page units with __GFP_NOFAIL.
-                 */
-                WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-                spin_lock_irqsave(&zone->lock, flags);
+        /*
+         * We most definitely don't want callers attempting to
+         * allocate greater than order-1 page units with __GFP_NOFAIL.
+         */
+        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+        spin_lock_irqsave(&zone->lock, flags);
 
-                do {
-                        page = NULL;
-                        if (alloc_flags & ALLOC_HARDER) {
-                                page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-                                if (page)
-                                        trace_mm_page_alloc_zone_locked(page, order, migratetype);
-                        }
-                        if (!page)
-                                page = __rmqueue(zone, order, migratetype);
-                } while (page && check_new_pages(page, order));
-                spin_unlock(&zone->lock);
+        do {
+                page = NULL;
+                if (alloc_flags & ALLOC_HARDER) {
+                        page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                        if (page)
+                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
+                }
                 if (!page)
-                        goto failed;
-                __mod_zone_freepage_state(zone, -(1 << order),
-                                  get_pcppage_migratetype(page));
-        }
+                        page = __rmqueue(zone, order, migratetype);
+        } while (page && check_new_pages(page, order));
+        spin_unlock(&zone->lock);
+        if (!page)
+                goto failed;
+        __mod_zone_freepage_state(zone, -(1 << order),
+                                get_pcppage_migratetype(page));
 
         __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
         zone_statistics(preferred_zone, zone);
         local_irq_restore(flags);
 
-        VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+        VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
         return page;
 
 failed:
@@ -2875,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 #ifdef CONFIG_NUMA
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
                                 RECLAIM_DISTANCE;
 }
 #else /* CONFIG_NUMA */
@@ -2972,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                 }
 
 try_this_zone:
-                page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+                page = rmqueue(ac->preferred_zoneref->zone, zone, order,
                                 gfp_mask, alloc_flags, ac->migratetype);
                 if (page) {
                         prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3825,76 +3894,76 @@ got_pg:
         return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-                        struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+                struct zonelist *zonelist, nodemask_t *nodemask,
+                struct alloc_context *ac, gfp_t *alloc_mask,
+                unsigned int *alloc_flags)
 {
-        struct page *page;
-        unsigned int alloc_flags = ALLOC_WMARK_LOW;
-        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
-        struct alloc_context ac = {
-                .high_zoneidx = gfp_zone(gfp_mask),
-                .zonelist = zonelist,
-                .nodemask = nodemask,
-                .migratetype = gfpflags_to_migratetype(gfp_mask),
-        };
+        ac->high_zoneidx = gfp_zone(gfp_mask);
+        ac->zonelist = zonelist;
+        ac->nodemask = nodemask;
+        ac->migratetype = gfpflags_to_migratetype(gfp_mask);
 
         if (cpusets_enabled()) {
-                alloc_mask |= __GFP_HARDWALL;
-                alloc_flags |= ALLOC_CPUSET;
-                if (!ac.nodemask)
-                        ac.nodemask = &cpuset_current_mems_allowed;
+                *alloc_mask |= __GFP_HARDWALL;
+                if (!ac->nodemask)
+                        ac->nodemask = &cpuset_current_mems_allowed;
+                else
+                        *alloc_flags |= ALLOC_CPUSET;
         }
 
-        gfp_mask &= gfp_allowed_mask;
-
         lockdep_trace_alloc(gfp_mask);
 
         might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
         if (should_fail_alloc_page(gfp_mask, order))
-                return NULL;
+                return false;
 
-        /*
-         * Check the zones suitable for the gfp_mask contain at least one
-         * valid zone. It's possible to have an empty zonelist as a result
-         * of __GFP_THISNODE and a memoryless node
-         */
-        if (unlikely(!zonelist->_zonerefs->zone))
-                return NULL;
+        if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+                *alloc_flags |= ALLOC_CMA;
 
-        if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
-                alloc_flags |= ALLOC_CMA;
+        return true;
+}
 
+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+                unsigned int order, struct alloc_context *ac)
+{
         /* Dirty zone balancing only done in the fast path */
-        ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+        ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
         /*
          * The preferred zone is used for statistics but crucially it is
          * also used as the starting point for the zonelist iterator. It
          * may get reset for allocations that ignore memory policies.
          */
-        ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-                                        ac.high_zoneidx, ac.nodemask);
-        if (!ac.preferred_zoneref->zone) {
-                page = NULL;
-                /*
-                 * This might be due to race with cpuset_current_mems_allowed
-                 * update, so make sure we retry with original nodemask in the
-                 * slow path.
-                 */
-                goto no_zone;
-        }
+        ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+                                        ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+                        struct zonelist *zonelist, nodemask_t *nodemask)
+{
+        struct page *page;
+        unsigned int alloc_flags = ALLOC_WMARK_LOW;
+        gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+        struct alloc_context ac = { };
+
+        gfp_mask &= gfp_allowed_mask;
+        if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+                return NULL;
+
+        finalise_ac(gfp_mask, order, &ac);
 
         /* First allocation attempt */
         page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
         if (likely(page))
                 goto out;
 
-no_zone:
         /*
          * Runtime PM, block IO and its error handling path can deadlock
          * because I/O on the device might not complete.
@@ -5856,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
          * the zone and SPARSEMEM is in use. If there are holes within the
          * zone, each populated memory region may cost us one or two extra
          * memmap pages due to alignment because memmap pages for each
-         * populated regions may not naturally algined on page boundary.
+         * populated regions may not be naturally aligned on page boundary.
          * So the (present_pages >> 4) heuristic is a tradeoff for that.
          */
         if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6420,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
                 start_pfn = end_pfn;
         }
-        arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
-        arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
 
         /* Find the PFNs that ZONE_MOVABLE begins at in each node */
         memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -7157,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename,
  * If @count is not zero, it is okay to include less @count unmovable pages
  *
  * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
  */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                          bool skip_hwpoisoned_pages)
@@ -7214,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                 if (skip_hwpoisoned_pages && PageHWPoison(page))
                         continue;
 
+                if (__PageMovable(page))
+                        continue;
+
                 if (!PageLRU(page))
                         found++;
                 /*
@@ -7325,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  *                      #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
  *                      in range must have the same migratetype and it must
  *                      be either of the two.
+ * @gfp_mask:   GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
  * aligned, however it's the caller's responsibility to guarantee that
@@ -7338,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range(unsigned long start, unsigned long end,
-                       unsigned migratetype)
+                       unsigned migratetype, gfp_t gfp_mask)
 {
         unsigned long outer_start, outer_end;
         unsigned int order;
@@ -7350,7 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                 .zone = page_zone(pfn_to_page(start)),
                 .mode = MIGRATE_SYNC,
                 .ignore_skip_hint = true,
-                .gfp_mask = GFP_KERNEL,
+                .gfp_mask = memalloc_noio_flags(gfp_mask),
         };
         INIT_LIST_HEAD(&cc.migratepages);
 