Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	324
1 file changed, 198 insertions, 126 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c21b33668133..a7a6aac95a6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -59,7 +59,6 @@
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
-#include <linux/page_ext.h>
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/page_owner.h>
@@ -92,6 +91,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 int _node_numa_mem_[MAX_NUMNODES];
 #endif

+/* work_structs for global per-cpu drains */
+DEFINE_MUTEX(pcpu_drain_mutex);
+DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
 EXPORT_SYMBOL(latent_entropy);
@@ -1085,10 +1088,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
	int migratetype = 0;
	int batch_free = 0;
-	unsigned long nr_scanned;
+	unsigned long nr_scanned, flags;
	bool isolated_pageblocks;

-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
	isolated_pageblocks = has_isolate_pageblock(zone);
	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
	if (nr_scanned)
@@ -1137,7 +1140,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
			trace_mm_page_pcpu_drain(page, 0, mt);
		} while (--count && --batch_free && !list_empty(list));
	}
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }

 static void free_one_page(struct zone *zone,
@@ -1145,8 +1148,9 @@ static void free_one_page(struct zone *zone,
				unsigned int order,
				int migratetype)
 {
-	unsigned long nr_scanned;
-	spin_lock(&zone->lock);
+	unsigned long nr_scanned, flags;
+	spin_lock_irqsave(&zone->lock, flags);
+	__count_vm_events(PGFREE, 1 << order);
	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
	if (nr_scanned)
		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1156,7 +1160,7 @@ static void free_one_page(struct zone *zone,
		migratetype = get_pfnblock_migratetype(page, pfn);
	}
	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
 }

 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1234,7 +1238,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)

 static void __free_pages_ok(struct page *page, unsigned int order)
 {
-	unsigned long flags;
	int migratetype;
	unsigned long pfn = page_to_pfn(page);

@@ -1242,10 +1245,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
-	local_irq_save(flags);
-	__count_vm_events(PGFREE, 1 << order);
	free_one_page(page_zone(page), page, pfn, order, migratetype);
-	local_irq_restore(flags);
 }

 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2217,8 +2217,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
			int migratetype, bool cold)
 {
	int i, alloced = 0;
+	unsigned long flags;

-	spin_lock(&zone->lock);
+	spin_lock_irqsave(&zone->lock, flags);
	for (i = 0; i < count; ++i) {
		struct page *page = __rmqueue(zone, order, migratetype);
		if (unlikely(page == NULL))
@@ -2254,7 +2255,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
	 * pages added to the pcp list.
	 */
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
-	spin_unlock(&zone->lock);
+	spin_unlock_irqrestore(&zone->lock, flags);
	return alloced;
 }

@@ -2339,16 +2340,26 @@ void drain_local_pages(struct zone *zone)
	drain_pages(cpu);
 }

+static void drain_local_pages_wq(struct work_struct *work)
+{
+	/*
+	 * drain_all_pages doesn't use proper cpu hotplug protection so
+	 * we can race with cpu offline when the WQ can move this from
+	 * a cpu pinned worker to an unbound one. We can operate on a different
+	 * cpu which is allright but we also have to make sure to not move to
+	 * a different one.
+	 */
+	preempt_disable();
+	drain_local_pages(NULL);
+	preempt_enable();
+}
+
 /*
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
  *
- * Note that this code is protected against sending an IPI to an offline
- * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
- * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
- * nothing keeps CPUs from showing up after we populated the cpumask and
- * before the call to on_each_cpu_mask().
+ * Note that this can be extremely slow as the draining happens in a workqueue.
  */
 void drain_all_pages(struct zone *zone)
 {
@@ -2360,6 +2371,21 @@ void drain_all_pages(struct zone *zone)
	 */
	static cpumask_t cpus_with_pcps;

+	/* Workqueues cannot recurse */
+	if (current->flags & PF_WQ_WORKER)
+		return;
+
+	/*
+	 * Do not drain if one is already in progress unless it's specific to
+	 * a zone. Such callers are primarily CMA and memory hotplug and need
+	 * the drain to be complete when the call returns.
+	 */
+	if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
+		if (!zone)
+			return;
+		mutex_lock(&pcpu_drain_mutex);
+	}
+
	/*
	 * We don't care about racing with CPU hotplug event
	 * as offline notification will cause the notified
@@ -2390,8 +2416,16 @@ void drain_all_pages(struct zone *zone)
		else
			cpumask_clear_cpu(cpu, &cpus_with_pcps);
	}
-	on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
-								zone, 1);
+
+	for_each_cpu(cpu, &cpus_with_pcps) {
+		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
+		INIT_WORK(work, drain_local_pages_wq);
+		schedule_work_on(cpu, work);
+	}
+	for_each_cpu(cpu, &cpus_with_pcps)
+		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+
+	mutex_unlock(&pcpu_drain_mutex);
 }

 #ifdef CONFIG_HIBERNATION
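
The drain path above replaces the old on_each_cpu_mask() IPI with per-CPU work items that are queued and then flushed, all serialised by pcpu_drain_mutex. Stripped of the page-allocator specifics, the queue-then-flush shape of that loop looks like the sketch below (hypothetical demo_* names, not part of this patch). Each work item runs in process context on its target CPU, which is also why drain_all_pages() now refuses to run from a workqueue worker: flushing a work item from inside the workqueue could deadlock.

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/workqueue.h>

/* Hypothetical per-CPU work items, mirroring pcpu_drain above. */
static DEFINE_PER_CPU(struct work_struct, demo_drain_work);

static void demo_drain_fn(struct work_struct *work)
{
	/* Runs on the CPU it was scheduled on; drain that CPU's caches here. */
}

/* Queue one work item per CPU in @mask, then wait for all of them. */
static void demo_drain_cpus(const struct cpumask *mask)
{
	int cpu;

	for_each_cpu(cpu, mask) {
		struct work_struct *work = per_cpu_ptr(&demo_drain_work, cpu);

		INIT_WORK(work, demo_drain_fn);
		schedule_work_on(cpu, work);
	}
	for_each_cpu(cpu, mask)
		flush_work(per_cpu_ptr(&demo_drain_work, cpu));
}
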
@@ -2442,17 +2476,20 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
	struct zone *zone = page_zone(page);
	struct per_cpu_pages *pcp;
-	unsigned long flags;
	unsigned long pfn = page_to_pfn(page);
	int migratetype;

+	if (in_interrupt()) {
+		__free_pages_ok(page, 0);
+		return;
+	}
+
	if (!free_pcp_prepare(page))
		return;

	migratetype = get_pfnblock_migratetype(page, pfn);
	set_pcppage_migratetype(page, migratetype);
-	local_irq_save(flags);
-	__count_vm_event(PGFREE);
+	preempt_disable();

	/*
	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2469,6 +2506,7 @@ void free_hot_cold_page(struct page *page, bool cold)
		migratetype = MIGRATE_MOVABLE;
	}

+	__count_vm_event(PGFREE);
	pcp = &this_cpu_ptr(zone->pageset)->pcp;
	if (!cold)
		list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2482,7 +2520,7 @@ void free_hot_cold_page(struct page *page, bool cold)
	}

 out:
-	local_irq_restore(flags);
+	preempt_enable();
 }

 /*
@@ -2600,74 +2638,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #endif
 }

+/* Remove page from the per-cpu list, caller must protect the list */
+static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+			bool cold, struct per_cpu_pages *pcp,
+			struct list_head *list)
+{
+	struct page *page;
+
+	VM_BUG_ON(in_interrupt());
+
+	do {
+		if (list_empty(list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, list,
+					migratetype, cold);
+			if (unlikely(list_empty(list)))
+				return NULL;
+		}
+
+		if (cold)
+			page = list_last_entry(list, struct page, lru);
+		else
+			page = list_first_entry(list, struct page, lru);
+
+		list_del(&page->lru);
+		pcp->count--;
+	} while (check_new_pcp(page));
+
+	return page;
+}
+
+/* Lock and remove page from the per-cpu list */
+static struct page *rmqueue_pcplist(struct zone *preferred_zone,
+			struct zone *zone, unsigned int order,
+			gfp_t gfp_flags, int migratetype)
+{
+	struct per_cpu_pages *pcp;
+	struct list_head *list;
+	bool cold = ((gfp_flags & __GFP_COLD) != 0);
+	struct page *page;
+
+	preempt_disable();
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
+	list = &pcp->lists[migratetype];
+	page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+	if (page) {
+		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+		zone_statistics(preferred_zone, zone);
+	}
+	preempt_enable();
+	return page;
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
 static inline
-struct page *buffered_rmqueue(struct zone *preferred_zone,
+struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
 {
	unsigned long flags;
	struct page *page;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);

-	if (likely(order == 0)) {
-		struct per_cpu_pages *pcp;
-		struct list_head *list;
-
-		local_irq_save(flags);
-		do {
-			pcp = &this_cpu_ptr(zone->pageset)->pcp;
-			list = &pcp->lists[migratetype];
-			if (list_empty(list)) {
-				pcp->count += rmqueue_bulk(zone, 0,
-						pcp->batch, list,
-						migratetype, cold);
-				if (unlikely(list_empty(list)))
-					goto failed;
-			}
-
-			if (cold)
-				page = list_last_entry(list, struct page, lru);
-			else
-				page = list_first_entry(list, struct page, lru);
-
-			list_del(&page->lru);
-			pcp->count--;
+	if (likely(order == 0) && !in_interrupt()) {
+		page = rmqueue_pcplist(preferred_zone, zone, order,
+				gfp_flags, migratetype);
+		goto out;
+	}

-		} while (check_new_pcp(page));
-	} else {
-		/*
-		 * We most definitely don't want callers attempting to
-		 * allocate greater than order-1 page units with __GFP_NOFAIL.
-		 */
-		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-		spin_lock_irqsave(&zone->lock, flags);
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+	spin_lock_irqsave(&zone->lock, flags);

-		do {
-			page = NULL;
-			if (alloc_flags & ALLOC_HARDER) {
-				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-				if (page)
-					trace_mm_page_alloc_zone_locked(page, order, migratetype);
-			}
-			if (!page)
-				page = __rmqueue(zone, order, migratetype);
-		} while (page && check_new_pages(page, order));
-		spin_unlock(&zone->lock);
-		if (!page)
-			goto failed;
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-	}
+	do {
+		page = NULL;
+		if (alloc_flags & ALLOC_HARDER) {
+			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			if (page)
+				trace_mm_page_alloc_zone_locked(page, order, migratetype);
+		}
+		if (!page)
+			page = __rmqueue(zone, order, migratetype);
+	} while (page && check_new_pages(page, order));
+	spin_unlock(&zone->lock);
+	if (!page)
+		goto failed;
+	__mod_zone_freepage_state(zone, -(1 << order),
+				  get_pcppage_migratetype(page));

	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
	zone_statistics(preferred_zone, zone);
	local_irq_restore(flags);

-	VM_BUG_ON_PAGE(bad_range(zone, page), page);
+out:
+	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;

 failed:
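
The rmqueue() rework above is the core of the series: order-0 requests from process context are served from the per-CPU lists under nothing stronger than preempt_disable(), while everything else (higher orders, or any allocation from interrupt context) goes to zone->lock with an IRQ-safe spinlock. As a rough model of that split, and only as an illustration (the demo_* names are invented, not kernel API), the pattern is a per-CPU cache refilled from a shared pool guarded by an IRQ-safe lock:

#include <linux/cpumask.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>

struct demo_cache {
	struct list_head items;
	int count;
};

static DEFINE_PER_CPU(struct demo_cache, demo_caches);
static DEFINE_SPINLOCK(demo_pool_lock);
static LIST_HEAD(demo_pool);

/* Call once at init time, before demo_get() is ever used. */
static void demo_cache_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(demo_caches, cpu).items);
}

static struct list_head *demo_get(void)
{
	struct demo_cache *cache;
	struct list_head *item = NULL;

	preempt_disable();			/* pin the per-CPU cache */
	cache = this_cpu_ptr(&demo_caches);
	if (list_empty(&cache->items)) {
		unsigned long flags;

		/* Refill under the shared lock, IRQ-safe like zone->lock. */
		spin_lock_irqsave(&demo_pool_lock, flags);
		if (!list_empty(&demo_pool)) {
			list_move(demo_pool.next, &cache->items);
			cache->count++;
		}
		spin_unlock_irqrestore(&demo_pool_lock, flags);
	}
	if (!list_empty(&cache->items)) {
		item = cache->items.next;
		list_del(item);
		cache->count--;
	}
	preempt_enable();
	return item;
}

The point of the split is that the common order-0 path never enters an IRQ-disabled section at all; only the refill from the shared pool does.
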
@@ -2875,7 +2944,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 #ifdef CONFIG_NUMA
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
-	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
+	return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
				RECLAIM_DISTANCE;
 }
 #else	/* CONFIG_NUMA */
@@ -2972,7 +3041,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
	}

 try_this_zone:
-		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
+		page = rmqueue(ac->preferred_zoneref->zone, zone, order,
				gfp_mask, alloc_flags, ac->migratetype);
		if (page) {
			prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3825,76 +3894,76 @@ got_pg:
	return page;
 }

-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask,
+		struct alloc_context *ac, gfp_t *alloc_mask,
+		unsigned int *alloc_flags)
 {
-	struct page *page;
-	unsigned int alloc_flags = ALLOC_WMARK_LOW;
-	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
-	struct alloc_context ac = {
-		.high_zoneidx = gfp_zone(gfp_mask),
-		.zonelist = zonelist,
-		.nodemask = nodemask,
-		.migratetype = gfpflags_to_migratetype(gfp_mask),
-	};
+	ac->high_zoneidx = gfp_zone(gfp_mask);
+	ac->zonelist = zonelist;
+	ac->nodemask = nodemask;
+	ac->migratetype = gfpflags_to_migratetype(gfp_mask);

	if (cpusets_enabled()) {
-		alloc_mask |= __GFP_HARDWALL;
-		alloc_flags |= ALLOC_CPUSET;
-		if (!ac.nodemask)
-			ac.nodemask = &cpuset_current_mems_allowed;
+		*alloc_mask |= __GFP_HARDWALL;
+		if (!ac->nodemask)
+			ac->nodemask = &cpuset_current_mems_allowed;
+		else
+			*alloc_flags |= ALLOC_CPUSET;
	}

-	gfp_mask &= gfp_allowed_mask;
-
	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);

	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+		return false;

-	/*
-	 * Check the zones suitable for the gfp_mask contain at least one
-	 * valid zone. It's possible to have an empty zonelist as a result
-	 * of __GFP_THISNODE and a memoryless node
-	 */
-	if (unlikely(!zonelist->_zonerefs->zone))
-		return NULL;
+	if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
+		*alloc_flags |= ALLOC_CMA;

-	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
+	return true;
+}

+/* Determine whether to spread dirty pages and what the first usable zone */
+static inline void finalise_ac(gfp_t gfp_mask,
+		unsigned int order, struct alloc_context *ac)
+{
	/* Dirty zone balancing only done in the fast path */
-	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+	ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);

	/*
	 * The preferred zone is used for statistics but crucially it is
	 * also used as the starting point for the zonelist iterator. It
	 * may get reset for allocations that ignore memory policies.
	 */
-	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-					ac.high_zoneidx, ac.nodemask);
-	if (!ac.preferred_zoneref->zone) {
-		page = NULL;
-		/*
-		 * This might be due to race with cpuset_current_mems_allowed
-		 * update, so make sure we retry with original nodemask in the
-		 * slow path.
-		 */
-		goto no_zone;
-	}
+	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+					ac->high_zoneidx, ac->nodemask);
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	struct page *page;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW;
+	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+	struct alloc_context ac = { };
+
+	gfp_mask &= gfp_allowed_mask;
+	if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+		return NULL;
+
+	finalise_ac(gfp_mask, order, &ac);

	/* First allocation attempt */
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
	if (likely(page))
		goto out;

-no_zone:
	/*
	 * Runtime PM, block IO and its error handling path can deadlock
	 * because I/O on the device might not complete.
@@ -5856,7 +5925,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
	 * the zone and SPARSEMEM is in use. If there are holes within the
	 * zone, each populated memory region may cost us one or two extra
	 * memmap pages due to alignment because memmap pages for each
-	 * populated regions may not naturally algined on page boundary.
+	 * populated regions may not be naturally aligned on page boundary.
	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
	 */
	if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6420,8 +6489,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)

		start_pfn = end_pfn;
	}
-	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
-	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -7157,8 +7224,9 @@ void *__init alloc_large_system_hash(const char *tablename,
  * If @count is not zero, it is okay to include less @count unmovable pages
  *
  * PageLRU check without isolation or lru_lock could race so that
- * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
- * expect this function should be exact.
+ * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
+ * check without lock_page also may miss some movable non-lru pages at
+ * race condition. So you can't expect this function should be exact.
  */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
			 bool skip_hwpoisoned_pages)
@@ -7214,6 +7282,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
		if (skip_hwpoisoned_pages && PageHWPoison(page))
			continue;

+		if (__PageMovable(page))
+			continue;
+
		if (!PageLRU(page))
			found++;
		/*
@@ -7325,6 +7396,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  *			#MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
  *			in range must have the same migratetype and it must
  *			be either of the two.
+ * @gfp_mask:	GFP mask to use during compaction
  *
  * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
  * aligned, however it's the caller's responsibility to guarantee that
@@ -7338,7 +7410,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
  * need to be freed with free_contig_range().
  */
 int alloc_contig_range(unsigned long start, unsigned long end,
-		       unsigned migratetype)
+		       unsigned migratetype, gfp_t gfp_mask)
 {
	unsigned long outer_start, outer_end;
	unsigned int order;
@@ -7350,7 +7422,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
		.zone = page_zone(pfn_to_page(start)),
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
-		.gfp_mask = GFP_KERNEL,
+		.gfp_mask = memalloc_noio_flags(gfp_mask),
	};
	INIT_LIST_HEAD(&cc.migratepages);

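
alloc_contig_range() now takes the compaction GFP mask from its caller instead of hard-coding GFP_KERNEL, and memalloc_noio_flags() strips the IO/FS reclaim flags from it when the calling task is in a memalloc_noio section. A minimal, hypothetical caller (not part of this diff) would pass the allocation context explicitly:

#include <linux/gfp.h>

/* Hypothetical caller: claim a physically contiguous range of movable
 * pages, handing the allocation context down to compaction/migration. */
static int demo_grab_range(unsigned long start_pfn, unsigned long end_pfn)
{
	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
				  GFP_KERNEL);
}
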