Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  526
1 file changed, 234 insertions(+), 292 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c503a116..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
28#include <linux/kasan.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include <linux/suspend.h> 30#include <linux/suspend.h>
30#include <linux/pagevec.h> 31#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 173 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 174 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 175 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 176 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 177 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 178 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 179 * don't need any ZONE_NORMAL reservation
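
The reservation figures in the comment above follow directly from the lowmem reserve ratios. Below is a minimal userspace sketch of that arithmetic, assuming the ratios of 256 for ZONE_DMA and 32 for ZONE_NORMAL that the example figures imply; it is an illustration only, not kernel code.

#include <stdio.h>

int main(void)
{
	/* the 1G example from the comment, all figures in megabytes */
	unsigned long dma = 16, normal = 784, high = 224;
	unsigned long dma_ratio = 256, normal_ratio = 32;	/* assumed defaults */

	printf("zones: %luM dma, %luM normal, %luM high\n", dma, normal, high);
	printf("NORMAL alloc reserves %luM of ZONE_DMA\n", normal / dma_ratio);
	printf("HIGHMEM alloc reserves %luM of ZONE_NORMAL\n", high / normal_ratio);
	printf("HIGHMEM alloc reserves %luM of ZONE_DMA\n",
	       (high + normal) / dma_ratio);
	return 0;
}
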
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 245 PB_migrate, PB_migrate_end);
245} 246}
246 247
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 248#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 250{
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
381 } 380 }
382} 381}
383 382
384/* update __split_huge_page_refcount if you change this function */
385static int destroy_compound_page(struct page *page, unsigned long order)
386{
387 int i;
388 int nr_pages = 1 << order;
389 int bad = 0;
390
391 if (unlikely(compound_order(page) != order)) {
392 bad_page(page, "wrong compound order", 0);
393 bad++;
394 }
395
396 __ClearPageHead(page);
397
398 for (i = 1; i < nr_pages; i++) {
399 struct page *p = page + i;
400
401 if (unlikely(!PageTail(p))) {
402 bad_page(page, "PageTail not set", 0);
403 bad++;
404 } else if (unlikely(p->first_page != page)) {
405 bad_page(page, "first_page not consistent", 0);
406 bad++;
407 }
408 __ClearPageTail(p);
409 }
410
411 return bad;
412}
413
414static inline void prep_zero_page(struct page *page, unsigned int order, 383static inline void prep_zero_page(struct page *page, unsigned int order,
415 gfp_t gfp_flags) 384 gfp_t gfp_flags)
416{ 385{
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
552 return 0; 521 return 0;
553 522
554 if (page_is_guard(buddy) && page_order(buddy) == order) { 523 if (page_is_guard(buddy) && page_order(buddy) == order) {
555 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
556
557 if (page_zone_id(page) != page_zone_id(buddy)) 524 if (page_zone_id(page) != page_zone_id(buddy))
558 return 0; 525 return 0;
559 526
527 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
528
560 return 1; 529 return 1;
561 } 530 }
562 531
563 if (PageBuddy(buddy) && page_order(buddy) == order) { 532 if (PageBuddy(buddy) && page_order(buddy) == order) {
564 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
565
566 /* 533 /*
567 * zone check is done late to avoid uselessly 534 * zone check is done late to avoid uselessly
568 * calculating zone/node ids for pages that could 535 * calculating zone/node ids for pages that could
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
571 if (page_zone_id(page) != page_zone_id(buddy)) 538 if (page_zone_id(page) != page_zone_id(buddy))
572 return 0; 539 return 0;
573 540
541 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
542
574 return 1; 543 return 1;
575 } 544 }
576 return 0; 545 return 0;
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page,
613 int max_order = MAX_ORDER; 582 int max_order = MAX_ORDER;
614 583
615 VM_BUG_ON(!zone_is_initialized(zone)); 584 VM_BUG_ON(!zone_is_initialized(zone));
616 585 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
617 if (unlikely(PageCompound(page)))
618 if (unlikely(destroy_compound_page(page, order)))
619 return;
620 586
621 VM_BUG_ON(migratetype == -1); 587 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) { 588 if (is_migrate_isolate(migratetype)) {
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone,
797 spin_unlock(&zone->lock); 763 spin_unlock(&zone->lock);
798} 764}
799 765
766static int free_tail_pages_check(struct page *head_page, struct page *page)
767{
768 if (!IS_ENABLED(CONFIG_DEBUG_VM))
769 return 0;
770 if (unlikely(!PageTail(page))) {
771 bad_page(page, "PageTail not set", 0);
772 return 1;
773 }
774 if (unlikely(page->first_page != head_page)) {
775 bad_page(page, "first_page not consistent", 0);
776 return 1;
777 }
778 return 0;
779}
780
800static bool free_pages_prepare(struct page *page, unsigned int order) 781static bool free_pages_prepare(struct page *page, unsigned int order)
801{ 782{
802 int i; 783 bool compound = PageCompound(page);
803 int bad = 0; 784 int i, bad = 0;
804 785
805 VM_BUG_ON_PAGE(PageTail(page), page); 786 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); 787 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
807 788
808 trace_mm_page_free(page, order); 789 trace_mm_page_free(page, order);
809 kmemcheck_free_shadow(page, order); 790 kmemcheck_free_shadow(page, order);
791 kasan_free_pages(page, order);
810 792
811 if (PageAnon(page)) 793 if (PageAnon(page))
812 page->mapping = NULL; 794 page->mapping = NULL;
813 for (i = 0; i < (1 << order); i++) 795 bad += free_pages_check(page);
796 for (i = 1; i < (1 << order); i++) {
797 if (compound)
798 bad += free_tail_pages_check(page, page + i);
814 bad += free_pages_check(page + i); 799 bad += free_pages_check(page + i);
800 }
815 if (bad) 801 if (bad)
816 return false; 802 return false;
817 803
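
The new free_tail_pages_check() takes over the tail-page validation that the removed destroy_compound_page() used to do at free time: every tail page must have PageTail set and first_page pointing back at the head. A toy userspace model of those two checks, using a made-up struct in place of struct page:

#include <stdbool.h>
#include <stdio.h>

/* deliberately tiny stand-in for struct page, enough for the two checks */
struct fake_page {
	bool tail;			/* models PageTail() */
	struct fake_page *first_page;	/* models page->first_page */
};

static int check_tail(struct fake_page *head, struct fake_page *p)
{
	if (!p->tail) {
		fprintf(stderr, "PageTail not set\n");
		return 1;
	}
	if (p->first_page != head) {
		fprintf(stderr, "first_page not consistent\n");
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_page pages[4] = { 0 };
	int i, bad = 0;

	for (i = 1; i < 4; i++) {
		pages[i].tail = true;
		pages[i].first_page = &pages[0];
	}
	pages[3].first_page = NULL;	/* inject one inconsistency */

	for (i = 1; i < 4; i++)
		bad += check_tail(&pages[0], &pages[i]);

	printf("bad tail pages: %d\n", bad);
	return 0;
}
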
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page)
970 return 0; 956 return 0;
971} 957}
972 958
973static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 959static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
960 int alloc_flags)
974{ 961{
975 int i; 962 int i;
976 963
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
985 972
986 arch_alloc_page(page, order); 973 arch_alloc_page(page, order);
987 kernel_map_pages(page, 1 << order, 1); 974 kernel_map_pages(page, 1 << order, 1);
975 kasan_alloc_pages(page, order);
988 976
989 if (gfp_flags & __GFP_ZERO) 977 if (gfp_flags & __GFP_ZERO)
990 prep_zero_page(page, order, gfp_flags); 978 prep_zero_page(page, order, gfp_flags);
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
994 982
995 set_page_owner(page, order, gfp_flags); 983 set_page_owner(page, order, gfp_flags);
996 984
985 /*
986 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
987 * allocate the page. The expectation is that the caller is taking
988 * steps that will free more memory. The caller should avoid the page
989 * being used for !PFMEMALLOC purposes.
990 */
991 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
992
997 return 0; 993 return 0;
998} 994}
999 995
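
The !!(...) in the added pfmemalloc assignment is the usual idiom for collapsing a bitmask test to exactly 0 or 1 before storing it in a boolean-like field. A tiny standalone demonstration with invented flag values (the real ALLOC_* flags live in the mm internals and are not reproduced here):

#include <stdio.h>

#define TOY_ALLOC_NO_WATERMARKS 0x04	/* invented value for illustration */
#define TOY_ALLOC_CPUSET	0x01	/* invented value for illustration */

int main(void)
{
	int alloc_flags = TOY_ALLOC_NO_WATERMARKS | TOY_ALLOC_CPUSET;

	/* plain mask test yields the bit value, not 0/1 */
	printf("mask test : %d\n", alloc_flags & TOY_ALLOC_NO_WATERMARKS);
	/* double negation normalises it to exactly 0 or 1 */
	printf("with !!   : %d\n", !!(alloc_flags & TOY_ALLOC_NO_WATERMARKS));
	return 0;
}
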
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page,
1130} 1126}
1131 1127
1132/* 1128/*
1133 * If breaking a large block of pages, move all free pages to the preferred 1129 * When we are falling back to another migratetype during allocation, try to
1134 * allocation list. If falling back for a reclaimable kernel allocation, be 1130 * steal extra free pages from the same pageblocks to satisfy further
1135 * more aggressive about taking ownership of free pages. 1131 * allocations, instead of polluting multiple pageblocks.
1136 * 1132 *
1137 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1133 * If we are stealing a relatively large buddy page, it is likely there will
1138 * nor move CMA pages to different free lists. We don't want unmovable pages 1134 * be more free pages in the pageblock, so try to steal them all. For
1139 * to be allocated from MIGRATE_CMA areas. 1135 * reclaimable and unmovable allocations, we steal regardless of page size,
1136 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks.
1140 * 1139 *
1141 * Returns the new migratetype of the pageblock (or the same old migratetype 1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1142 * if it was unchanged). 1141 * as well.
1143 */ 1142 */
1144static int try_to_steal_freepages(struct zone *zone, struct page *page, 1143static void try_to_steal_freepages(struct zone *zone, struct page *page,
1145 int start_type, int fallback_type) 1144 int start_type, int fallback_type)
1146{ 1145{
1147 int current_order = page_order(page); 1146 int current_order = page_order(page);
1148 1147
1149 /*
1150 * When borrowing from MIGRATE_CMA, we need to release the excess
1151 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1152 * is set to CMA so it is returned to the correct freelist in case
1153 * the page ends up being not actually allocated from the pcp lists.
1154 */
1155 if (is_migrate_cma(fallback_type))
1156 return fallback_type;
1157
1158 /* Take ownership for orders >= pageblock_order */ 1148 /* Take ownership for orders >= pageblock_order */
1159 if (current_order >= pageblock_order) { 1149 if (current_order >= pageblock_order) {
1160 change_pageblock_range(page, current_order, start_type); 1150 change_pageblock_range(page, current_order, start_type);
1161 return start_type; 1151 return;
1162 } 1152 }
1163 1153
1164 if (current_order >= pageblock_order / 2 || 1154 if (current_order >= pageblock_order / 2 ||
1165 start_type == MIGRATE_RECLAIMABLE || 1155 start_type == MIGRATE_RECLAIMABLE ||
1156 start_type == MIGRATE_UNMOVABLE ||
1166 page_group_by_mobility_disabled) { 1157 page_group_by_mobility_disabled) {
1167 int pages; 1158 int pages;
1168 1159
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1170 1161
1171 /* Claim the whole block if over half of it is free */ 1162 /* Claim the whole block if over half of it is free */
1172 if (pages >= (1 << (pageblock_order-1)) || 1163 if (pages >= (1 << (pageblock_order-1)) ||
1173 page_group_by_mobility_disabled) { 1164 page_group_by_mobility_disabled)
1174
1175 set_pageblock_migratetype(page, start_type); 1165 set_pageblock_migratetype(page, start_type);
1176 return start_type;
1177 }
1178
1179 } 1166 }
1180
1181 return fallback_type;
1182} 1167}
1183 1168
1184/* Remove an element from the buddy allocator from the fallback list */ 1169/* Remove an element from the buddy allocator from the fallback list */
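
The "claim more than half of the pageblock" rule in try_to_steal_freepages() boils down to comparing the number of free pages moved against 1 << (pageblock_order - 1). A minimal sketch of that threshold, assuming pageblock_order == 9 (a 2MB block of 4KB pages) purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGEBLOCK_ORDER 9	/* assumed value; arch dependent in reality */

static bool claim_whole_block(int pages_moved)
{
	/* same comparison as "Claim the whole block if over half of it is free" */
	return pages_moved >= (1 << (TOY_PAGEBLOCK_ORDER - 1));
}

int main(void)
{
	printf("threshold: %d pages\n", 1 << (TOY_PAGEBLOCK_ORDER - 1));
	printf("200 pages moved -> claim pageblock? %d\n", claim_whole_block(200));
	printf("300 pages moved -> claim pageblock? %d\n", claim_whole_block(300));
	return 0;
}
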
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1188 struct free_area *area; 1173 struct free_area *area;
1189 unsigned int current_order; 1174 unsigned int current_order;
1190 struct page *page; 1175 struct page *page;
1191 int migratetype, new_type, i;
1192 1176
1193 /* Find the largest possible block of pages in the other list */ 1177 /* Find the largest possible block of pages in the other list */
1194 for (current_order = MAX_ORDER-1; 1178 for (current_order = MAX_ORDER-1;
1195 current_order >= order && current_order <= MAX_ORDER-1; 1179 current_order >= order && current_order <= MAX_ORDER-1;
1196 --current_order) { 1180 --current_order) {
1181 int i;
1197 for (i = 0;; i++) { 1182 for (i = 0;; i++) {
1198 migratetype = fallbacks[start_migratetype][i]; 1183 int migratetype = fallbacks[start_migratetype][i];
1184 int buddy_type = start_migratetype;
1199 1185
1200 /* MIGRATE_RESERVE handled later if necessary */ 1186 /* MIGRATE_RESERVE handled later if necessary */
1201 if (migratetype == MIGRATE_RESERVE) 1187 if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1209 struct page, lru); 1195 struct page, lru);
1210 area->nr_free--; 1196 area->nr_free--;
1211 1197
1212 new_type = try_to_steal_freepages(zone, page, 1198 if (!is_migrate_cma(migratetype)) {
1213 start_migratetype, 1199 try_to_steal_freepages(zone, page,
1214 migratetype); 1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1215 1211
1216 /* Remove the page from the freelists */ 1212 /* Remove the page from the freelists */
1217 list_del(&page->lru); 1213 list_del(&page->lru);
1218 rmv_page_order(page); 1214 rmv_page_order(page);
1219 1215
1220 expand(zone, page, order, current_order, area, 1216 expand(zone, page, order, current_order, area,
1221 new_type); 1217 buddy_type);
1222 /* The freepage_migratetype may differ from pageblock's 1218
1219 /*
1220 * The freepage_migratetype may differ from pageblock's
1223 * migratetype depending on the decisions in 1221 * migratetype depending on the decisions in
1224 * try_to_steal_freepages. This is OK as long as it does 1222 * try_to_steal_freepages(). This is OK as long as it
1225 * not differ for MIGRATE_CMA type. 1223 * does not differ for MIGRATE_CMA pageblocks. For CMA
1224 * we need to make sure unallocated pages flushed from
1225 * pcp lists are returned to the correct freelist.
1226 */ 1226 */
1227 set_freepage_migratetype(page, new_type); 1227 set_freepage_migratetype(page, buddy_type);
1228 1228
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1229 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype, new_type); 1230 start_migratetype, migratetype);
1231 1231
1232 return page; 1232 return page;
1233 } 1233 }
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page)
1642} 1642}
1643 1643
1644/* 1644/*
1645 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1645 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1646 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1647 * or two.
1648 */ 1646 */
1649static inline 1647static inline
1650struct page *buffered_rmqueue(struct zone *preferred_zone, 1648struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1655 struct page *page; 1653 struct page *page;
1656 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1654 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1657 1655
1658again:
1659 if (likely(order == 0)) { 1656 if (likely(order == 0)) {
1660 struct per_cpu_pages *pcp; 1657 struct per_cpu_pages *pcp;
1661 struct list_head *list; 1658 struct list_head *list;
@@ -1711,8 +1708,6 @@ again:
1711 local_irq_restore(flags); 1708 local_irq_restore(flags);
1712 1709
1713 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1710 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1714 if (prep_new_page(page, order, gfp_flags))
1715 goto again;
1716 return page; 1711 return page;
1717 1712
1718failed: 1713failed:
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
2033 * a page. 2028 * a page.
2034 */ 2029 */
2035static struct page * 2030static struct page *
2036get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 2031get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2037 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 2032 const struct alloc_context *ac)
2038 struct zone *preferred_zone, int classzone_idx, int migratetype)
2039{ 2033{
2034 struct zonelist *zonelist = ac->zonelist;
2040 struct zoneref *z; 2035 struct zoneref *z;
2041 struct page *page = NULL; 2036 struct page *page = NULL;
2042 struct zone *zone; 2037 struct zone *zone;
@@ -2055,8 +2050,8 @@ zonelist_scan:
2055 * Scan zonelist, looking for a zone with enough free. 2050 * Scan zonelist, looking for a zone with enough free.
2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2051 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2057 */ 2052 */
2058 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2053 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2059 high_zoneidx, nodemask) { 2054 ac->nodemask) {
2060 unsigned long mark; 2055 unsigned long mark;
2061 2056
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2068,7 @@ zonelist_scan:
2073 * time the page has in memory before being reclaimed. 2068 * time the page has in memory before being reclaimed.
2074 */ 2069 */
2075 if (alloc_flags & ALLOC_FAIR) { 2070 if (alloc_flags & ALLOC_FAIR) {
2076 if (!zone_local(preferred_zone, zone)) 2071 if (!zone_local(ac->preferred_zone, zone))
2077 break; 2072 break;
2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2073 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2079 nr_fair_skipped++; 2074 nr_fair_skipped++;
@@ -2111,7 +2106,7 @@ zonelist_scan:
2111 2106
2112 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2107 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2113 if (!zone_watermark_ok(zone, order, mark, 2108 if (!zone_watermark_ok(zone, order, mark,
2114 classzone_idx, alloc_flags)) { 2109 ac->classzone_idx, alloc_flags)) {
2115 int ret; 2110 int ret;
2116 2111
2117 /* Checked here to keep the fast path fast */ 2112 /* Checked here to keep the fast path fast */
@@ -2132,7 +2127,7 @@ zonelist_scan:
2132 } 2127 }
2133 2128
2134 if (zone_reclaim_mode == 0 || 2129 if (zone_reclaim_mode == 0 ||
2135 !zone_allows_reclaim(preferred_zone, zone)) 2130 !zone_allows_reclaim(ac->preferred_zone, zone))
2136 goto this_zone_full; 2131 goto this_zone_full;
2137 2132
2138 /* 2133 /*
@@ -2154,7 +2149,7 @@ zonelist_scan:
2154 default: 2149 default:
2155 /* did we reclaim enough */ 2150 /* did we reclaim enough */
2156 if (zone_watermark_ok(zone, order, mark, 2151 if (zone_watermark_ok(zone, order, mark,
2157 classzone_idx, alloc_flags)) 2152 ac->classzone_idx, alloc_flags))
2158 goto try_this_zone; 2153 goto try_this_zone;
2159 2154
2160 /* 2155 /*
@@ -2175,27 +2170,18 @@ zonelist_scan:
2175 } 2170 }
2176 2171
2177try_this_zone: 2172try_this_zone:
2178 page = buffered_rmqueue(preferred_zone, zone, order, 2173 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2179 gfp_mask, migratetype); 2174 gfp_mask, ac->migratetype);
2180 if (page) 2175 if (page) {
2181 break; 2176 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2177 goto try_this_zone;
2178 return page;
2179 }
2182this_zone_full: 2180this_zone_full:
2183 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2181 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2184 zlc_mark_zone_full(zonelist, z); 2182 zlc_mark_zone_full(zonelist, z);
2185 } 2183 }
2186 2184
2187 if (page) {
2188 /*
2189 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2190 * necessary to allocate the page. The expectation is
2191 * that the caller is taking steps that will free more
2192 * memory. The caller should avoid the page being used
2193 * for !PFMEMALLOC purposes.
2194 */
2195 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2196 return page;
2197 }
2198
2199 /* 2185 /*
2200 * The first pass makes sure allocations are spread fairly within the 2186 * The first pass makes sure allocations are spread fairly within the
2201 * local node. However, the local node might have free pages left 2187 * local node. However, the local node might have free pages left
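
With prep_new_page() now called from get_page_from_freelist(), a page that fails its checks is simply skipped and another one is pulled from the same zone via the goto try_this_zone. A toy allocator showing the shape of that retry; the names below are invented, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	bool bad;
};

/* one corrupt entry first, to force the retry */
static struct toy_page pool[] = { { .bad = true }, { .bad = false } };
static unsigned int next;

static struct toy_page *toy_rmqueue(void)
{
	return next < 2 ? &pool[next++] : NULL;
}

static bool toy_prep_page(const struct toy_page *page)
{
	return !page->bad;	/* models check_new_page() flagging a bad page */
}

int main(void)
{
	const struct toy_page *page;

	while ((page = toy_rmqueue()) != NULL) {
		if (!toy_prep_page(page)) {
			puts("bad page detected, retrying from the same zone");
			continue;
		}
		puts("page prepared and returned to the caller");
		break;
	}
	return 0;
}
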
@@ -2208,7 +2194,7 @@ this_zone_full:
2208 alloc_flags &= ~ALLOC_FAIR; 2194 alloc_flags &= ~ALLOC_FAIR;
2209 if (nr_fair_skipped) { 2195 if (nr_fair_skipped) {
2210 zonelist_rescan = true; 2196 zonelist_rescan = true;
2211 reset_alloc_batches(preferred_zone); 2197 reset_alloc_batches(ac->preferred_zone);
2212 } 2198 }
2213 if (nr_online_nodes > 1) 2199 if (nr_online_nodes > 1)
2214 zonelist_rescan = true; 2200 zonelist_rescan = true;
@@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2330 2316
2331static inline struct page * 2317static inline struct page *
2332__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2318__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 struct zonelist *zonelist, enum zone_type high_zoneidx, 2319 const struct alloc_context *ac, unsigned long *did_some_progress)
2334 nodemask_t *nodemask, struct zone *preferred_zone,
2335 int classzone_idx, int migratetype)
2336{ 2320{
2337 struct page *page; 2321 struct page *page;
2338 2322
2339 /* Acquire the per-zone oom lock for each zone */ 2323 *did_some_progress = 0;
2340 if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2341 schedule_timeout_uninterruptible(1);
2342 return NULL;
2343 }
2344 2324
2345 /* 2325 /*
2346 * PM-freezer should be notified that there might be an OOM killer on 2326 * Acquire the per-zone oom lock for each zone. If that
2347 * its way to kill and wake somebody up. This is too early and we might 2327 * fails, somebody else is making progress for us.
2348 * end up not killing anything but false positives are acceptable.
2349 * See freeze_processes.
2350 */ 2328 */
2351 note_oom_kill(); 2329 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
2330 *did_some_progress = 1;
2331 schedule_timeout_uninterruptible(1);
2332 return NULL;
2333 }
2352 2334
2353 /* 2335 /*
2354 * Go through the zonelist yet one more time, keep very high watermark 2336 * Go through the zonelist yet one more time, keep very high watermark
2355 * here, this is only to catch a parallel oom killing, we must fail if 2337 * here, this is only to catch a parallel oom killing, we must fail if
2356 * we're still under heavy pressure. 2338 * we're still under heavy pressure.
2357 */ 2339 */
2358 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2340 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2359 order, zonelist, high_zoneidx, 2341 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
2360 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2361 preferred_zone, classzone_idx, migratetype);
2362 if (page) 2342 if (page)
2363 goto out; 2343 goto out;
2364 2344
2365 if (!(gfp_mask & __GFP_NOFAIL)) { 2345 if (!(gfp_mask & __GFP_NOFAIL)) {
2346 /* Coredumps can quickly deplete all memory reserves */
2347 if (current->flags & PF_DUMPCORE)
2348 goto out;
2366 /* The OOM killer will not help higher order allocs */ 2349 /* The OOM killer will not help higher order allocs */
2367 if (order > PAGE_ALLOC_COSTLY_ORDER) 2350 if (order > PAGE_ALLOC_COSTLY_ORDER)
2368 goto out; 2351 goto out;
2369 /* The OOM killer does not needlessly kill tasks for lowmem */ 2352 /* The OOM killer does not needlessly kill tasks for lowmem */
2370 if (high_zoneidx < ZONE_NORMAL) 2353 if (ac->high_zoneidx < ZONE_NORMAL)
2354 goto out;
2355 /* The OOM killer does not compensate for light reclaim */
2356 if (!(gfp_mask & __GFP_FS))
2371 goto out; 2357 goto out;
2372 /* 2358 /*
2373 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2359 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
@@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2380 goto out; 2366 goto out;
2381 } 2367 }
2382 /* Exhausted what can be done so it's blamo time */ 2368 /* Exhausted what can be done so it's blamo time */
2383 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2369 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2384 2370 *did_some_progress = 1;
2385out: 2371out:
2386 oom_zonelist_unlock(zonelist, gfp_mask); 2372 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2387 return page; 2373 return page;
2388} 2374}
2389 2375
@@ -2391,10 +2377,9 @@ out:
2391/* Try memory compaction for high-order allocations before reclaim */ 2377/* Try memory compaction for high-order allocations before reclaim */
2392static struct page * 2378static struct page *
2393__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2379__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2394 struct zonelist *zonelist, enum zone_type high_zoneidx, 2380 int alloc_flags, const struct alloc_context *ac,
2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2381 enum migrate_mode mode, int *contended_compaction,
2396 int classzone_idx, int migratetype, enum migrate_mode mode, 2382 bool *deferred_compaction)
2397 int *contended_compaction, bool *deferred_compaction)
2398{ 2383{
2399 unsigned long compact_result; 2384 unsigned long compact_result;
2400 struct page *page; 2385 struct page *page;
@@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2403 return NULL; 2388 return NULL;
2404 2389
2405 current->flags |= PF_MEMALLOC; 2390 current->flags |= PF_MEMALLOC;
2406 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2391 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2407 nodemask, mode, 2392 mode, contended_compaction);
2408 contended_compaction,
2409 alloc_flags, classzone_idx);
2410 current->flags &= ~PF_MEMALLOC; 2393 current->flags &= ~PF_MEMALLOC;
2411 2394
2412 switch (compact_result) { 2395 switch (compact_result) {
@@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2425 */ 2408 */
2426 count_vm_event(COMPACTSTALL); 2409 count_vm_event(COMPACTSTALL);
2427 2410
2428 page = get_page_from_freelist(gfp_mask, nodemask, 2411 page = get_page_from_freelist(gfp_mask, order,
2429 order, zonelist, high_zoneidx, 2412 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2430 alloc_flags & ~ALLOC_NO_WATERMARKS,
2431 preferred_zone, classzone_idx, migratetype);
2432 2413
2433 if (page) { 2414 if (page) {
2434 struct zone *zone = page_zone(page); 2415 struct zone *zone = page_zone(page);
@@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2452#else 2433#else
2453static inline struct page * 2434static inline struct page *
2454__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2435__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2455 struct zonelist *zonelist, enum zone_type high_zoneidx, 2436 int alloc_flags, const struct alloc_context *ac,
2456 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2437 enum migrate_mode mode, int *contended_compaction,
2457 int classzone_idx, int migratetype, enum migrate_mode mode, 2438 bool *deferred_compaction)
2458 int *contended_compaction, bool *deferred_compaction)
2459{ 2439{
2460 return NULL; 2440 return NULL;
2461} 2441}
@@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2463 2443
2464/* Perform direct synchronous page reclaim */ 2444/* Perform direct synchronous page reclaim */
2465static int 2445static int
2466__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2446__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2467 nodemask_t *nodemask) 2447 const struct alloc_context *ac)
2468{ 2448{
2469 struct reclaim_state reclaim_state; 2449 struct reclaim_state reclaim_state;
2470 int progress; 2450 int progress;
@@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2478 reclaim_state.reclaimed_slab = 0; 2458 reclaim_state.reclaimed_slab = 0;
2479 current->reclaim_state = &reclaim_state; 2459 current->reclaim_state = &reclaim_state;
2480 2460
2481 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2461 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2462 ac->nodemask);
2482 2463
2483 current->reclaim_state = NULL; 2464 current->reclaim_state = NULL;
2484 lockdep_clear_current_reclaim_state(); 2465 lockdep_clear_current_reclaim_state();
@@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2492/* The really slow allocator path where we enter direct reclaim */ 2473/* The really slow allocator path where we enter direct reclaim */
2493static inline struct page * 2474static inline struct page *
2494__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2475__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2495 struct zonelist *zonelist, enum zone_type high_zoneidx, 2476 int alloc_flags, const struct alloc_context *ac,
2496 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2477 unsigned long *did_some_progress)
2497 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2498{ 2478{
2499 struct page *page = NULL; 2479 struct page *page = NULL;
2500 bool drained = false; 2480 bool drained = false;
2501 2481
2502 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2482 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
2503 nodemask);
2504 if (unlikely(!(*did_some_progress))) 2483 if (unlikely(!(*did_some_progress)))
2505 return NULL; 2484 return NULL;
2506 2485
2507 /* After successful reclaim, reconsider all zones for allocation */ 2486 /* After successful reclaim, reconsider all zones for allocation */
2508 if (IS_ENABLED(CONFIG_NUMA)) 2487 if (IS_ENABLED(CONFIG_NUMA))
2509 zlc_clear_zones_full(zonelist); 2488 zlc_clear_zones_full(ac->zonelist);
2510 2489
2511retry: 2490retry:
2512 page = get_page_from_freelist(gfp_mask, nodemask, order, 2491 page = get_page_from_freelist(gfp_mask, order,
2513 zonelist, high_zoneidx, 2492 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2514 alloc_flags & ~ALLOC_NO_WATERMARKS,
2515 preferred_zone, classzone_idx,
2516 migratetype);
2517 2493
2518 /* 2494 /*
2519 * If an allocation failed after direct reclaim, it could be because 2495 * If an allocation failed after direct reclaim, it could be because
@@ -2534,36 +2510,30 @@ retry:
2534 */ 2510 */
2535static inline struct page * 2511static inline struct page *
2536__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2512__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2537 struct zonelist *zonelist, enum zone_type high_zoneidx, 2513 const struct alloc_context *ac)
2538 nodemask_t *nodemask, struct zone *preferred_zone,
2539 int classzone_idx, int migratetype)
2540{ 2514{
2541 struct page *page; 2515 struct page *page;
2542 2516
2543 do { 2517 do {
2544 page = get_page_from_freelist(gfp_mask, nodemask, order, 2518 page = get_page_from_freelist(gfp_mask, order,
2545 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2519 ALLOC_NO_WATERMARKS, ac);
2546 preferred_zone, classzone_idx, migratetype);
2547 2520
2548 if (!page && gfp_mask & __GFP_NOFAIL) 2521 if (!page && gfp_mask & __GFP_NOFAIL)
2549 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2522 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2523 HZ/50);
2550 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2524 } while (!page && (gfp_mask & __GFP_NOFAIL));
2551 2525
2552 return page; 2526 return page;
2553} 2527}
2554 2528
2555static void wake_all_kswapds(unsigned int order, 2529static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2556 struct zonelist *zonelist,
2557 enum zone_type high_zoneidx,
2558 struct zone *preferred_zone,
2559 nodemask_t *nodemask)
2560{ 2530{
2561 struct zoneref *z; 2531 struct zoneref *z;
2562 struct zone *zone; 2532 struct zone *zone;
2563 2533
2564 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2534 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2565 high_zoneidx, nodemask) 2535 ac->high_zoneidx, ac->nodemask)
2566 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2536 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
2567} 2537}
2568 2538
2569static inline int 2539static inline int
@@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2622 2592
2623static inline struct page * 2593static inline struct page *
2624__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2594__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2625 struct zonelist *zonelist, enum zone_type high_zoneidx, 2595 struct alloc_context *ac)
2626 nodemask_t *nodemask, struct zone *preferred_zone,
2627 int classzone_idx, int migratetype)
2628{ 2596{
2629 const gfp_t wait = gfp_mask & __GFP_WAIT; 2597 const gfp_t wait = gfp_mask & __GFP_WAIT;
2630 struct page *page = NULL; 2598 struct page *page = NULL;
@@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2658 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2626 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2659 goto nopage; 2627 goto nopage;
2660 2628
2661restart: 2629retry:
2662 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2630 if (!(gfp_mask & __GFP_NO_KSWAPD))
2663 wake_all_kswapds(order, zonelist, high_zoneidx, 2631 wake_all_kswapds(order, ac);
2664 preferred_zone, nodemask);
2665 2632
2666 /* 2633 /*
2667 * OK, we're below the kswapd watermark and have kicked background 2634 * OK, we're below the kswapd watermark and have kicked background
@@ -2674,18 +2641,16 @@ restart:
2674 * Find the true preferred zone if the allocation is unconstrained by 2641 * Find the true preferred zone if the allocation is unconstrained by
2675 * cpusets. 2642 * cpusets.
2676 */ 2643 */
2677 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2644 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
2678 struct zoneref *preferred_zoneref; 2645 struct zoneref *preferred_zoneref;
2679 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2646 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2680 NULL, &preferred_zone); 2647 ac->high_zoneidx, NULL, &ac->preferred_zone);
2681 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2648 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
2682 } 2649 }
2683 2650
2684rebalance:
2685 /* This is the last chance, in general, before the goto nopage. */ 2651 /* This is the last chance, in general, before the goto nopage. */
2686 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2652 page = get_page_from_freelist(gfp_mask, order,
2687 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2653 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2688 preferred_zone, classzone_idx, migratetype);
2689 if (page) 2654 if (page)
2690 goto got_pg; 2655 goto got_pg;
2691 2656
@@ -2696,11 +2661,10 @@ rebalance:
2696 * the allocation is high priority and these type of 2661 * the allocation is high priority and these type of
2697 * allocations are system rather than user orientated 2662 * allocations are system rather than user orientated
2698 */ 2663 */
2699 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2664 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
2665
2666 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2700 2667
2701 page = __alloc_pages_high_priority(gfp_mask, order,
2702 zonelist, high_zoneidx, nodemask,
2703 preferred_zone, classzone_idx, migratetype);
2704 if (page) { 2668 if (page) {
2705 goto got_pg; 2669 goto got_pg;
2706 } 2670 }
@@ -2729,11 +2693,9 @@ rebalance:
2729 * Try direct compaction. The first pass is asynchronous. Subsequent 2693 * Try direct compaction. The first pass is asynchronous. Subsequent
2730 * attempts after direct reclaim are synchronous 2694 * attempts after direct reclaim are synchronous
2731 */ 2695 */
2732 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2696 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2733 high_zoneidx, nodemask, alloc_flags, 2697 migration_mode,
2734 preferred_zone, 2698 &contended_compaction,
2735 classzone_idx, migratetype,
2736 migration_mode, &contended_compaction,
2737 &deferred_compaction); 2699 &deferred_compaction);
2738 if (page) 2700 if (page)
2739 goto got_pg; 2701 goto got_pg;
@@ -2779,74 +2741,40 @@ rebalance:
2779 migration_mode = MIGRATE_SYNC_LIGHT; 2741 migration_mode = MIGRATE_SYNC_LIGHT;
2780 2742
2781 /* Try direct reclaim and then allocating */ 2743 /* Try direct reclaim and then allocating */
2782 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2744 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2783 zonelist, high_zoneidx, 2745 &did_some_progress);
2784 nodemask,
2785 alloc_flags, preferred_zone,
2786 classzone_idx, migratetype,
2787 &did_some_progress);
2788 if (page) 2746 if (page)
2789 goto got_pg; 2747 goto got_pg;
2790 2748
2791 /*
2792 * If we failed to make any progress reclaiming, then we are
2793 * running out of options and have to consider going OOM
2794 */
2795 if (!did_some_progress) {
2796 if (oom_gfp_allowed(gfp_mask)) {
2797 if (oom_killer_disabled)
2798 goto nopage;
2799 /* Coredumps can quickly deplete all memory reserves */
2800 if ((current->flags & PF_DUMPCORE) &&
2801 !(gfp_mask & __GFP_NOFAIL))
2802 goto nopage;
2803 page = __alloc_pages_may_oom(gfp_mask, order,
2804 zonelist, high_zoneidx,
2805 nodemask, preferred_zone,
2806 classzone_idx, migratetype);
2807 if (page)
2808 goto got_pg;
2809
2810 if (!(gfp_mask & __GFP_NOFAIL)) {
2811 /*
2812 * The oom killer is not called for high-order
2813 * allocations that may fail, so if no progress
2814 * is being made, there are no other options and
2815 * retrying is unlikely to help.
2816 */
2817 if (order > PAGE_ALLOC_COSTLY_ORDER)
2818 goto nopage;
2819 /*
2820 * The oom killer is not called for lowmem
2821 * allocations to prevent needlessly killing
2822 * innocent tasks.
2823 */
2824 if (high_zoneidx < ZONE_NORMAL)
2825 goto nopage;
2826 }
2827
2828 goto restart;
2829 }
2830 }
2831
2832 /* Check if we should retry the allocation */ 2749 /* Check if we should retry the allocation */
2833 pages_reclaimed += did_some_progress; 2750 pages_reclaimed += did_some_progress;
2834 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2751 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2835 pages_reclaimed)) { 2752 pages_reclaimed)) {
2753 /*
2754 * If we fail to make progress by freeing individual
2755 * pages, but the allocation wants us to keep going,
2756 * start OOM killing tasks.
2757 */
2758 if (!did_some_progress) {
2759 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2760 &did_some_progress);
2761 if (page)
2762 goto got_pg;
2763 if (!did_some_progress)
2764 goto nopage;
2765 }
2836 /* Wait for some write requests to complete then retry */ 2766 /* Wait for some write requests to complete then retry */
2837 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2767 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2838 goto rebalance; 2768 goto retry;
2839 } else { 2769 } else {
2840 /* 2770 /*
2841 * High-order allocations do not necessarily loop after 2771 * High-order allocations do not necessarily loop after
2842 * direct reclaim and reclaim/compaction depends on compaction 2772 * direct reclaim and reclaim/compaction depends on compaction
2843 * being called after reclaim so call directly if necessary 2773 * being called after reclaim so call directly if necessary
2844 */ 2774 */
2845 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2775 page = __alloc_pages_direct_compact(gfp_mask, order,
2846 high_zoneidx, nodemask, alloc_flags, 2776 alloc_flags, ac, migration_mode,
2847 preferred_zone, 2777 &contended_compaction,
2848 classzone_idx, migratetype,
2849 migration_mode, &contended_compaction,
2850 &deferred_compaction); 2778 &deferred_compaction);
2851 if (page) 2779 if (page)
2852 goto got_pg; 2780 goto got_pg;
@@ -2854,11 +2782,7 @@ rebalance:
2854 2782
2855nopage: 2783nopage:
2856 warn_alloc_failed(gfp_mask, order, NULL); 2784 warn_alloc_failed(gfp_mask, order, NULL);
2857 return page;
2858got_pg: 2785got_pg:
2859 if (kmemcheck_enabled)
2860 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2861
2862 return page; 2786 return page;
2863} 2787}
2864 2788
@@ -2869,14 +2793,16 @@ struct page *
2869__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2793__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2870 struct zonelist *zonelist, nodemask_t *nodemask) 2794 struct zonelist *zonelist, nodemask_t *nodemask)
2871{ 2795{
2872 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2873 struct zone *preferred_zone;
2874 struct zoneref *preferred_zoneref; 2796 struct zoneref *preferred_zoneref;
2875 struct page *page = NULL; 2797 struct page *page = NULL;
2876 int migratetype = gfpflags_to_migratetype(gfp_mask);
2877 unsigned int cpuset_mems_cookie; 2798 unsigned int cpuset_mems_cookie;
2878 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2799 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2879 int classzone_idx; 2800 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
2801 struct alloc_context ac = {
2802 .high_zoneidx = gfp_zone(gfp_mask),
2803 .nodemask = nodemask,
2804 .migratetype = gfpflags_to_migratetype(gfp_mask),
2805 };
2880 2806
2881 gfp_mask &= gfp_allowed_mask; 2807 gfp_mask &= gfp_allowed_mask;
2882 2808
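
Most of the churn in this patch comes from bundling the per-allocation parameters into struct alloc_context and passing a single pointer around instead of six positional arguments. A simplified illustration of that parameter-object pattern with placeholder fields (the real struct is defined elsewhere in mm/ and not reproduced here):

#include <stdio.h>

struct toy_alloc_context {
	int high_zoneidx;
	int classzone_idx;
	int migratetype;
	const char *preferred_zone;
};

static void toy_get_page(unsigned int order, const struct toy_alloc_context *ac)
{
	printf("order-%u allocation from zone %s, migratetype %d\n",
	       order, ac->preferred_zone, ac->migratetype);
}

int main(void)
{
	struct toy_alloc_context ac = {
		.high_zoneidx	= 2,
		.classzone_idx	= 2,
		.migratetype	= 0,
		.preferred_zone	= "Normal",
	};

	/* every helper now takes just (order, &ac) instead of a long list */
	toy_get_page(0, &ac);
	return 0;
}
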
@@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2895 if (unlikely(!zonelist->_zonerefs->zone)) 2821 if (unlikely(!zonelist->_zonerefs->zone))
2896 return NULL; 2822 return NULL;
2897 2823
2898 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) 2824 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
2899 alloc_flags |= ALLOC_CMA; 2825 alloc_flags |= ALLOC_CMA;
2900 2826
2901retry_cpuset: 2827retry_cpuset:
2902 cpuset_mems_cookie = read_mems_allowed_begin(); 2828 cpuset_mems_cookie = read_mems_allowed_begin();
2903 2829
2830 /* We set it here, as __alloc_pages_slowpath might have changed it */
2831 ac.zonelist = zonelist;
2904 /* The preferred zone is used for statistics later */ 2832 /* The preferred zone is used for statistics later */
2905 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2833 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
2906 nodemask ? : &cpuset_current_mems_allowed, 2834 ac.nodemask ? : &cpuset_current_mems_allowed,
2907 &preferred_zone); 2835 &ac.preferred_zone);
2908 if (!preferred_zone) 2836 if (!ac.preferred_zone)
2909 goto out; 2837 goto out;
2910 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2838 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
2911 2839
2912 /* First allocation attempt */ 2840 /* First allocation attempt */
2913 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2841 alloc_mask = gfp_mask|__GFP_HARDWALL;
2914 zonelist, high_zoneidx, alloc_flags, 2842 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
2915 preferred_zone, classzone_idx, migratetype);
2916 if (unlikely(!page)) { 2843 if (unlikely(!page)) {
2917 /* 2844 /*
2918 * Runtime PM, block IO and its error handling path 2845 * Runtime PM, block IO and its error handling path
2919 * can deadlock because I/O on the device might not 2846 * can deadlock because I/O on the device might not
2920 * complete. 2847 * complete.
2921 */ 2848 */
2922 gfp_mask = memalloc_noio_flags(gfp_mask); 2849 alloc_mask = memalloc_noio_flags(gfp_mask);
2923 page = __alloc_pages_slowpath(gfp_mask, order, 2850
2924 zonelist, high_zoneidx, nodemask, 2851 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
2925 preferred_zone, classzone_idx, migratetype);
2926 } 2852 }
2927 2853
2928 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2854 if (kmemcheck_enabled && page)
2855 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2856
2857 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
2929 2858
2930out: 2859out:
2931 /* 2860 /*
@@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data)
3945 return 0; 3874 return 0;
3946} 3875}
3947 3876
3877static noinline void __init
3878build_all_zonelists_init(void)
3879{
3880 __build_all_zonelists(NULL);
3881 mminit_verify_zonelist();
3882 cpuset_init_current_mems_allowed();
3883}
3884
3948/* 3885/*
3949 * Called with zonelists_mutex held always 3886 * Called with zonelists_mutex held always
3950 * unless system_state == SYSTEM_BOOTING. 3887 * unless system_state == SYSTEM_BOOTING.
3888 *
3889 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3890 * [we're only called with non-NULL zone through __meminit paths] and
3891 * (2) call of __init annotated helper build_all_zonelists_init
3892 * [protected by SYSTEM_BOOTING].
3951 */ 3893 */
3952void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3894void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3953{ 3895{
3954 set_zonelist_order(); 3896 set_zonelist_order();
3955 3897
3956 if (system_state == SYSTEM_BOOTING) { 3898 if (system_state == SYSTEM_BOOTING) {
3957 __build_all_zonelists(NULL); 3899 build_all_zonelists_init();
3958 mminit_verify_zonelist();
3959 cpuset_init_current_mems_allowed();
3960 } else { 3900 } else {
3961#ifdef CONFIG_MEMORY_HOTPLUG 3901#ifdef CONFIG_MEMORY_HOTPLUG
3962 if (zone) 3902 if (zone)
@@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5059 pgdat->node_start_pfn = node_start_pfn; 4999 pgdat->node_start_pfn = node_start_pfn;
5060#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5000#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5061 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5001 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5062 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, 5002 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5063 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); 5003 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
5064#endif 5004#endif
5065 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5005 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5066 zones_size, zholes_size); 5006 zones_size, zholes_size);
@@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5432 arch_zone_highest_possible_pfn[i]) 5372 arch_zone_highest_possible_pfn[i])
5433 pr_cont("empty\n"); 5373 pr_cont("empty\n");
5434 else 5374 else
5435 pr_cont("[mem %0#10lx-%0#10lx]\n", 5375 pr_cont("[mem %#018Lx-%#018Lx]\n",
5436 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5376 (u64)arch_zone_lowest_possible_pfn[i]
5437 (arch_zone_highest_possible_pfn[i] 5377 << PAGE_SHIFT,
5378 ((u64)arch_zone_highest_possible_pfn[i]
5438 << PAGE_SHIFT) - 1); 5379 << PAGE_SHIFT) - 1);
5439 } 5380 }
5440 5381
@@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5442 pr_info("Movable zone start for each node\n"); 5383 pr_info("Movable zone start for each node\n");
5443 for (i = 0; i < MAX_NUMNODES; i++) { 5384 for (i = 0; i < MAX_NUMNODES; i++) {
5444 if (zone_movable_pfn[i]) 5385 if (zone_movable_pfn[i])
5445 pr_info(" Node %d: %#010lx\n", i, 5386 pr_info(" Node %d: %#018Lx\n", i,
5446 zone_movable_pfn[i] << PAGE_SHIFT); 5387 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
5447 } 5388 }
5448 5389
5449 /* Print out the early node map */ 5390 /* Print out the early node map */
5450 pr_info("Early memory node ranges\n"); 5391 pr_info("Early memory node ranges\n");
5451 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5392 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5452 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5393 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5453 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5394 (u64)start_pfn << PAGE_SHIFT,
5395 ((u64)end_pfn << PAGE_SHIFT) - 1);
5454 5396
5455 /* Initialise every node */ 5397 /* Initialise every node */
5456 mminit_verify_pageflags_layout(); 5398 mminit_verify_pageflags_layout();
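
The printk format changes above cast PFNs to u64 before shifting, because pfn << PAGE_SHIFT overflows unsigned long on 32-bit kernels with enough memory. A small userspace demonstration of the difference, emulating the 32-bit case with fixed-width types:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* a PFN for roughly 4.5GB of RAM; the PFN fits in 32 bits, its address does not */
	unsigned int end_pfn = 0x120000;

	unsigned int truncated = end_pfn << PAGE_SHIFT;		/* wraps in 32 bits */
	uint64_t full = (uint64_t)end_pfn << PAGE_SHIFT;	/* what the cast fixes */

	printf("without cast: %#010x\n", truncated);
	printf("with cast:    %#018" PRIx64 "\n", full);
	return 0;
}
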