Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  471
1 file changed, 216 insertions, 255 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e20f9c2fa5a..7abfa70cdc1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
28#include <linux/kasan.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include <linux/suspend.h> 30#include <linux/suspend.h>
30#include <linux/pagevec.h> 31#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 173 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 174 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 175 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 176 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 177 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 178 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 179 * don't need any ZONE_NORMAL reservation
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 245 PB_migrate, PB_migrate_end);
245} 246}
246 247
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 248#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 250{
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
381 } 380 }
382} 381}
383 382
384/* update __split_huge_page_refcount if you change this function */
385static int destroy_compound_page(struct page *page, unsigned long order)
386{
387 int i;
388 int nr_pages = 1 << order;
389 int bad = 0;
390
391 if (unlikely(compound_order(page) != order)) {
392 bad_page(page, "wrong compound order", 0);
393 bad++;
394 }
395
396 __ClearPageHead(page);
397
398 for (i = 1; i < nr_pages; i++) {
399 struct page *p = page + i;
400
401 if (unlikely(!PageTail(p))) {
402 bad_page(page, "PageTail not set", 0);
403 bad++;
404 } else if (unlikely(p->first_page != page)) {
405 bad_page(page, "first_page not consistent", 0);
406 bad++;
407 }
408 __ClearPageTail(p);
409 }
410
411 return bad;
412}
413
414static inline void prep_zero_page(struct page *page, unsigned int order, 383static inline void prep_zero_page(struct page *page, unsigned int order,
415 gfp_t gfp_flags) 384 gfp_t gfp_flags)
416{ 385{
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
552 return 0; 521 return 0;
553 522
554 if (page_is_guard(buddy) && page_order(buddy) == order) { 523 if (page_is_guard(buddy) && page_order(buddy) == order) {
555 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
556
557 if (page_zone_id(page) != page_zone_id(buddy)) 524 if (page_zone_id(page) != page_zone_id(buddy))
558 return 0; 525 return 0;
559 526
527 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
528
560 return 1; 529 return 1;
561 } 530 }
562 531
563 if (PageBuddy(buddy) && page_order(buddy) == order) { 532 if (PageBuddy(buddy) && page_order(buddy) == order) {
564 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
565
566 /* 533 /*
567 * zone check is done late to avoid uselessly 534 * zone check is done late to avoid uselessly
568 * calculating zone/node ids for pages that could 535 * calculating zone/node ids for pages that could
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
571 if (page_zone_id(page) != page_zone_id(buddy)) 538 if (page_zone_id(page) != page_zone_id(buddy))
572 return 0; 539 return 0;
573 540
541 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
542
574 return 1; 543 return 1;
575 } 544 }
576 return 0; 545 return 0;
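Editor's note: the two hunks above only reorder the checks inside page_is_buddy() so the zone comparison happens before the refcount assertion. page_is_buddy() merely validates a merge candidate; finding that candidate relies on the classic buddy rule of flipping the order bit in the page index. A minimal sketch of that pairing rule (textbook buddy arithmetic, not code taken from this diff):

#include <stdio.h>

/*
 * Within a max-order-aligned region, the buddy of the block that starts
 * at page index 'idx' and spans 2^order pages is found by flipping bit
 * 'order' of the index.
 */
static unsigned long buddy_index(unsigned long idx, unsigned int order)
{
        return idx ^ (1UL << order);
}

int main(void)
{
        /* Order-0 pages 4 and 5 are buddies; order-1 blocks 4 and 6 are. */
        printf("%lu %lu\n", buddy_index(4, 0), buddy_index(4, 1));
        return 0;
}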
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page,
613 int max_order = MAX_ORDER; 582 int max_order = MAX_ORDER;
614 583
615 VM_BUG_ON(!zone_is_initialized(zone)); 584 VM_BUG_ON(!zone_is_initialized(zone));
616 585 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
617 if (unlikely(PageCompound(page)))
618 if (unlikely(destroy_compound_page(page, order)))
619 return;
620 586
621 VM_BUG_ON(migratetype == -1); 587 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) { 588 if (is_migrate_isolate(migratetype)) {
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone,
797 spin_unlock(&zone->lock); 763 spin_unlock(&zone->lock);
798} 764}
799 765
766static int free_tail_pages_check(struct page *head_page, struct page *page)
767{
768 if (!IS_ENABLED(CONFIG_DEBUG_VM))
769 return 0;
770 if (unlikely(!PageTail(page))) {
771 bad_page(page, "PageTail not set", 0);
772 return 1;
773 }
774 if (unlikely(page->first_page != head_page)) {
775 bad_page(page, "first_page not consistent", 0);
776 return 1;
777 }
778 return 0;
779}
780
800static bool free_pages_prepare(struct page *page, unsigned int order) 781static bool free_pages_prepare(struct page *page, unsigned int order)
801{ 782{
802 int i; 783 bool compound = PageCompound(page);
803 int bad = 0; 784 int i, bad = 0;
804 785
805 VM_BUG_ON_PAGE(PageTail(page), page); 786 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); 787 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
807 788
808 trace_mm_page_free(page, order); 789 trace_mm_page_free(page, order);
809 kmemcheck_free_shadow(page, order); 790 kmemcheck_free_shadow(page, order);
791 kasan_free_pages(page, order);
810 792
811 if (PageAnon(page)) 793 if (PageAnon(page))
812 page->mapping = NULL; 794 page->mapping = NULL;
813 for (i = 0; i < (1 << order); i++) 795 bad += free_pages_check(page);
796 for (i = 1; i < (1 << order); i++) {
797 if (compound)
798 bad += free_tail_pages_check(page, page + i);
814 bad += free_pages_check(page + i); 799 bad += free_pages_check(page + i);
800 }
815 if (bad) 801 if (bad)
816 return false; 802 return false;
817 803
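Editor's note: with destroy_compound_page() gone, tail-page validation now happens inline in free_pages_prepare() via the new free_tail_pages_check(): every tail of a compound page must be PageTail and must point back to the head through first_page. A small userspace model of that invariant only (struct page and the flag helpers are of course far richer in the kernel):

#include <stdio.h>
#include <stddef.h>

struct toy_page {
        int is_tail;                 /* stands in for PageTail()        */
        struct toy_page *first_page; /* back-pointer to the head page   */
};

/* Mirrors the new check: a tail page is bad if either invariant fails. */
static int toy_tail_check(struct toy_page *head, struct toy_page *page)
{
        if (!page->is_tail) {
                fprintf(stderr, "bad page: PageTail not set\n");
                return 1;
        }
        if (page->first_page != head) {
                fprintf(stderr, "bad page: first_page not consistent\n");
                return 1;
        }
        return 0;
}

int main(void)
{
        struct toy_page pages[4] = { { 0, NULL } };
        int i, bad = 0;

        for (i = 1; i < 4; i++) {
                pages[i].is_tail = 1;
                pages[i].first_page = &pages[0];
        }
        pages[3].first_page = &pages[2];   /* deliberately corrupt one tail */

        for (i = 1; i < 4; i++)
                bad += toy_tail_check(&pages[0], &pages[i]);

        printf("bad tail pages: %d\n", bad);
        return 0;
}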
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page)
970 return 0; 956 return 0;
971} 957}
972 958
973static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 959static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
960 int alloc_flags)
974{ 961{
975 int i; 962 int i;
976 963
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
985 972
986 arch_alloc_page(page, order); 973 arch_alloc_page(page, order);
987 kernel_map_pages(page, 1 << order, 1); 974 kernel_map_pages(page, 1 << order, 1);
975 kasan_alloc_pages(page, order);
988 976
989 if (gfp_flags & __GFP_ZERO) 977 if (gfp_flags & __GFP_ZERO)
990 prep_zero_page(page, order, gfp_flags); 978 prep_zero_page(page, order, gfp_flags);
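Editor's note: this hunk and the free_pages_prepare() hunk above wire KASAN into the allocator: kasan_free_pages() poisons a freed block and kasan_alloc_pages() unpoisons it again on allocation. The real implementation lives under mm/kasan/; the following is only a toy userspace model of the shadow-poisoning idea, with one shadow byte per "page":

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define NPAGES 16
static unsigned char shadow[NPAGES];    /* 0 = accessible, 0xFF = poisoned */

static void toy_kasan_free_pages(int page, int order)
{
        /* Poison the whole 2^order block so later accesses are flagged. */
        memset(&shadow[page], 0xFF, 1u << order);
}

static void toy_kasan_alloc_pages(int page, int order)
{
        /* Unpoison on allocation; the block is legitimately usable again. */
        memset(&shadow[page], 0x00, 1u << order);
}

static int toy_access_ok(int page)
{
        return shadow[page] == 0;
}

int main(void)
{
        toy_kasan_alloc_pages(0, 2);            /* pages 0..3 usable   */
        assert(toy_access_ok(1));
        toy_kasan_free_pages(0, 2);             /* pages 0..3 poisoned */
        printf("use-after-free detected: %s\n",
               toy_access_ok(1) ? "no" : "yes");
        return 0;
}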
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
994 982
995 set_page_owner(page, order, gfp_flags); 983 set_page_owner(page, order, gfp_flags);
996 984
985 /*
986 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
987 * allocate the page. The expectation is that the caller is taking
988 * steps that will free more memory. The caller should avoid the page
989 * being used for !PFMEMALLOC purposes.
990 */
991 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
992
997 return 0; 993 return 0;
998} 994}
999 995
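Editor's note: the page->pfmemalloc marking moves here from the tail of get_page_from_freelist() (removed further down), together with its comment. The !! idiom simply collapses the masked flag bit into 0 or 1 so it can be stored in a boolean-style field; a trivial illustration (the ALLOC_NO_WATERMARKS value below is made up for the example):

#include <stdio.h>

#define ALLOC_NO_WATERMARKS 0x04   /* illustrative value, not the kernel's */

int main(void)
{
        int alloc_flags = 0x05;
        /* !! turns "any non-zero bits" into exactly 1. */
        int pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

        printf("%d\n", pfmemalloc);     /* prints 1 */
        return 0;
}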
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page,
1130} 1126}
1131 1127
1132/* 1128/*
1133 * If breaking a large block of pages, move all free pages to the preferred 1129 * When we are falling back to another migratetype during allocation, try to
1134 * allocation list. If falling back for a reclaimable kernel allocation, be 1130 * steal extra free pages from the same pageblocks to satisfy further
1135 * more aggressive about taking ownership of free pages. 1131 * allocations, instead of polluting multiple pageblocks.
1136 * 1132 *
1137 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1133 * If we are stealing a relatively large buddy page, it is likely there will
1138 * nor move CMA pages to different free lists. We don't want unmovable pages 1134 * be more free pages in the pageblock, so try to steal them all. For
1139 * to be allocated from MIGRATE_CMA areas. 1135 * reclaimable and unmovable allocations, we steal regardless of page size,
1136 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks.
1140 * 1139 *
1141 * Returns the new migratetype of the pageblock (or the same old migratetype 1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1142 * if it was unchanged). 1141 * as well.
1143 */ 1142 */
1144static int try_to_steal_freepages(struct zone *zone, struct page *page, 1143static void try_to_steal_freepages(struct zone *zone, struct page *page,
1145 int start_type, int fallback_type) 1144 int start_type, int fallback_type)
1146{ 1145{
1147 int current_order = page_order(page); 1146 int current_order = page_order(page);
1148 1147
1149 /*
1150 * When borrowing from MIGRATE_CMA, we need to release the excess
1151 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1152 * is set to CMA so it is returned to the correct freelist in case
1153 * the page ends up being not actually allocated from the pcp lists.
1154 */
1155 if (is_migrate_cma(fallback_type))
1156 return fallback_type;
1157
1158 /* Take ownership for orders >= pageblock_order */ 1148 /* Take ownership for orders >= pageblock_order */
1159 if (current_order >= pageblock_order) { 1149 if (current_order >= pageblock_order) {
1160 change_pageblock_range(page, current_order, start_type); 1150 change_pageblock_range(page, current_order, start_type);
1161 return start_type; 1151 return;
1162 } 1152 }
1163 1153
1164 if (current_order >= pageblock_order / 2 || 1154 if (current_order >= pageblock_order / 2 ||
1165 start_type == MIGRATE_RECLAIMABLE || 1155 start_type == MIGRATE_RECLAIMABLE ||
1156 start_type == MIGRATE_UNMOVABLE ||
1166 page_group_by_mobility_disabled) { 1157 page_group_by_mobility_disabled) {
1167 int pages; 1158 int pages;
1168 1159
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1170 1161
1171 /* Claim the whole block if over half of it is free */ 1162 /* Claim the whole block if over half of it is free */
1172 if (pages >= (1 << (pageblock_order-1)) || 1163 if (pages >= (1 << (pageblock_order-1)) ||
1173 page_group_by_mobility_disabled) { 1164 page_group_by_mobility_disabled)
1174
1175 set_pageblock_migratetype(page, start_type); 1165 set_pageblock_migratetype(page, start_type);
1176 return start_type;
1177 }
1178
1179 } 1166 }
1180
1181 return fallback_type;
1182} 1167}
1183 1168
1184/* Remove an element from the buddy allocator from the fallback list */ 1169/* Remove an element from the buddy allocator from the fallback list */
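Editor's note: the rewritten comment and code encode a simple policy: always take the whole pageblock for orders >= pageblock_order, otherwise steal extra free pages when the fallback buddy is reasonably large or the request is unmovable/reclaimable, and rewrite the pageblock's migratetype once more than half of it has been claimed. A condensed model of that decision follows (the page_group_by_mobility_disabled case is omitted, and 9 is just a commonly used pageblock_order, not a claim about every configuration):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* illustrative: 2MB blocks of 4K pages */

enum migratetype { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE };

/* Should we steal additional free pages from this fallback block? */
static bool steal_extra_freepages(unsigned int current_order, int start_type)
{
        if (current_order >= PAGEBLOCK_ORDER)
                return true;    /* taking the whole block anyway */
        return current_order >= PAGEBLOCK_ORDER / 2 ||
               start_type == MT_RECLAIMABLE ||
               start_type == MT_UNMOVABLE;
}

/* Once pages are moved, claim the block if more than half of it is ours. */
static bool claim_pageblock(int pages_moved)
{
        return pages_moved >= 1 << (PAGEBLOCK_ORDER - 1);
}

int main(void)
{
        printf("order-3 movable fallback steals extra:   %d\n",
               steal_extra_freepages(3, MT_MOVABLE));
        printf("order-3 unmovable fallback steals extra: %d\n",
               steal_extra_freepages(3, MT_UNMOVABLE));
        printf("claim block after moving 300 pages:      %d\n",
               claim_pageblock(300));
        return 0;
}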
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1188 struct free_area *area; 1173 struct free_area *area;
1189 unsigned int current_order; 1174 unsigned int current_order;
1190 struct page *page; 1175 struct page *page;
1191 int migratetype, new_type, i;
1192 1176
1193 /* Find the largest possible block of pages in the other list */ 1177 /* Find the largest possible block of pages in the other list */
1194 for (current_order = MAX_ORDER-1; 1178 for (current_order = MAX_ORDER-1;
1195 current_order >= order && current_order <= MAX_ORDER-1; 1179 current_order >= order && current_order <= MAX_ORDER-1;
1196 --current_order) { 1180 --current_order) {
1181 int i;
1197 for (i = 0;; i++) { 1182 for (i = 0;; i++) {
1198 migratetype = fallbacks[start_migratetype][i]; 1183 int migratetype = fallbacks[start_migratetype][i];
1184 int buddy_type = start_migratetype;
1199 1185
1200 /* MIGRATE_RESERVE handled later if necessary */ 1186 /* MIGRATE_RESERVE handled later if necessary */
1201 if (migratetype == MIGRATE_RESERVE) 1187 if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1209 struct page, lru); 1195 struct page, lru);
1210 area->nr_free--; 1196 area->nr_free--;
1211 1197
1212 new_type = try_to_steal_freepages(zone, page, 1198 if (!is_migrate_cma(migratetype)) {
1213 start_migratetype, 1199 try_to_steal_freepages(zone, page,
1214 migratetype); 1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1215 1211
1216 /* Remove the page from the freelists */ 1212 /* Remove the page from the freelists */
1217 list_del(&page->lru); 1213 list_del(&page->lru);
1218 rmv_page_order(page); 1214 rmv_page_order(page);
1219 1215
1220 expand(zone, page, order, current_order, area, 1216 expand(zone, page, order, current_order, area,
1221 new_type); 1217 buddy_type);
1222 /* The freepage_migratetype may differ from pageblock's 1218
1219 /*
1220 * The freepage_migratetype may differ from pageblock's
1223 * migratetype depending on the decisions in 1221 * migratetype depending on the decisions in
1224 * try_to_steal_freepages. This is OK as long as it does 1222 * try_to_steal_freepages(). This is OK as long as it
1225 * not differ for MIGRATE_CMA type. 1223 * does not differ for MIGRATE_CMA pageblocks. For CMA
1224 * we need to make sure unallocated pages flushed from
1225 * pcp lists are returned to the correct freelist.
1226 */ 1226 */
1227 set_freepage_migratetype(page, new_type); 1227 set_freepage_migratetype(page, buddy_type);
1228 1228
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1229 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype, new_type); 1230 start_migratetype, migratetype);
1231 1231
1232 return page; 1232 return page;
1233 } 1233 }
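Editor's note: the CMA branch no longer goes through try_to_steal_freepages() at all; buddy_type is forced to the CMA migratetype so that, when expand() splits the large buddy, the unused halves (and pages later flushed from the pcp lists) return to the CMA freelist instead of the requested one. A toy model of that split-and-return bookkeeping, freelist accounting only and heavily simplified:

#include <stdio.h>

enum { MT_MOVABLE, MT_CMA, NR_MT };

static int free_count[NR_MT];           /* pages sitting on each freelist */

/*
 * Split a 2^high block down to 2^low, returning the trimmings to the
 * freelist named by 'buddy_type' (MT_CMA when borrowing from CMA).
 */
static void toy_expand(unsigned int low, unsigned int high, int buddy_type)
{
        while (high > low) {
                high--;
                free_count[buddy_type] += 1 << high;
        }
}

int main(void)
{
        /* Borrow an order-5 CMA block to satisfy an order-0 request. */
        toy_expand(0, 5, MT_CMA);
        printf("pages returned to CMA freelist: %d\n", free_count[MT_CMA]);
        printf("pages returned to movable:      %d\n", free_count[MT_MOVABLE]);
        return 0;
}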
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page)
1642} 1642}
1643 1643
1644/* 1644/*
1645 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1645 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1646 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1647 * or two.
1648 */ 1646 */
1649static inline 1647static inline
1650struct page *buffered_rmqueue(struct zone *preferred_zone, 1648struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1655 struct page *page; 1653 struct page *page;
1656 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1654 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1657 1655
1658again:
1659 if (likely(order == 0)) { 1656 if (likely(order == 0)) {
1660 struct per_cpu_pages *pcp; 1657 struct per_cpu_pages *pcp;
1661 struct list_head *list; 1658 struct list_head *list;
@@ -1711,8 +1708,6 @@ again:
1711 local_irq_restore(flags); 1708 local_irq_restore(flags);
1712 1709
1713 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1710 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1714 if (prep_new_page(page, order, gfp_flags))
1715 goto again;
1716 return page; 1711 return page;
1717 1712
1718failed: 1713failed:
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
2033 * a page. 2028 * a page.
2034 */ 2029 */
2035static struct page * 2030static struct page *
2036get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 2031get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2037 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 2032 const struct alloc_context *ac)
2038 struct zone *preferred_zone, int classzone_idx, int migratetype)
2039{ 2033{
2034 struct zonelist *zonelist = ac->zonelist;
2040 struct zoneref *z; 2035 struct zoneref *z;
2041 struct page *page = NULL; 2036 struct page *page = NULL;
2042 struct zone *zone; 2037 struct zone *zone;
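Editor's note: from here on, the long parameter lists (zonelist, nodemask, high_zoneidx, preferred_zone, classzone_idx, migratetype) are folded into a single struct alloc_context, filled once in __alloc_pages_nodemask() and passed down by pointer. Judging only from the ac-> accessors visible in this diff, the structure looks roughly like the sketch below; the real definition lives outside this file and may differ in detail:

/* Stub types so the sketch compiles on its own; in the kernel these are
 * the real MM types from the mmzone/nodemask headers. */
struct zonelist;
struct zone;
typedef unsigned long nodemask_t;
enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

/* Field set inferred from the ac-> accesses in this diff; order is a guess. */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zone *preferred_zone;
        int classzone_idx;
        int migratetype;
        enum zone_type high_zoneidx;
};

int main(void)
{
        struct alloc_context ac = { .high_zoneidx = ZONE_NORMAL };
        (void)ac;
        return 0;
}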
@@ -2055,8 +2050,8 @@ zonelist_scan:
2055 * Scan zonelist, looking for a zone with enough free. 2050 * Scan zonelist, looking for a zone with enough free.
2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2051 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2057 */ 2052 */
2058 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2053 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2059 high_zoneidx, nodemask) { 2054 ac->nodemask) {
2060 unsigned long mark; 2055 unsigned long mark;
2061 2056
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2068,7 @@ zonelist_scan:
2073 * time the page has in memory before being reclaimed. 2068 * time the page has in memory before being reclaimed.
2074 */ 2069 */
2075 if (alloc_flags & ALLOC_FAIR) { 2070 if (alloc_flags & ALLOC_FAIR) {
2076 if (!zone_local(preferred_zone, zone)) 2071 if (!zone_local(ac->preferred_zone, zone))
2077 break; 2072 break;
2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2073 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2079 nr_fair_skipped++; 2074 nr_fair_skipped++;
@@ -2111,7 +2106,7 @@ zonelist_scan:
2111 2106
2112 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2107 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2113 if (!zone_watermark_ok(zone, order, mark, 2108 if (!zone_watermark_ok(zone, order, mark,
2114 classzone_idx, alloc_flags)) { 2109 ac->classzone_idx, alloc_flags)) {
2115 int ret; 2110 int ret;
2116 2111
2117 /* Checked here to keep the fast path fast */ 2112 /* Checked here to keep the fast path fast */
@@ -2132,7 +2127,7 @@ zonelist_scan:
2132 } 2127 }
2133 2128
2134 if (zone_reclaim_mode == 0 || 2129 if (zone_reclaim_mode == 0 ||
2135 !zone_allows_reclaim(preferred_zone, zone)) 2130 !zone_allows_reclaim(ac->preferred_zone, zone))
2136 goto this_zone_full; 2131 goto this_zone_full;
2137 2132
2138 /* 2133 /*
@@ -2154,7 +2149,7 @@ zonelist_scan:
2154 default: 2149 default:
2155 /* did we reclaim enough */ 2150 /* did we reclaim enough */
2156 if (zone_watermark_ok(zone, order, mark, 2151 if (zone_watermark_ok(zone, order, mark,
2157 classzone_idx, alloc_flags)) 2152 ac->classzone_idx, alloc_flags))
2158 goto try_this_zone; 2153 goto try_this_zone;
2159 2154
2160 /* 2155 /*
@@ -2175,27 +2170,18 @@ zonelist_scan:
2175 } 2170 }
2176 2171
2177try_this_zone: 2172try_this_zone:
2178 page = buffered_rmqueue(preferred_zone, zone, order, 2173 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2179 gfp_mask, migratetype); 2174 gfp_mask, ac->migratetype);
2180 if (page) 2175 if (page) {
2181 break; 2176 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2177 goto try_this_zone;
2178 return page;
2179 }
2182this_zone_full: 2180this_zone_full:
2183 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2181 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2184 zlc_mark_zone_full(zonelist, z); 2182 zlc_mark_zone_full(zonelist, z);
2185 } 2183 }
2186 2184
2187 if (page) {
2188 /*
2189 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2190 * necessary to allocate the page. The expectation is
2191 * that the caller is taking steps that will free more
2192 * memory. The caller should avoid the page being used
2193 * for !PFMEMALLOC purposes.
2194 */
2195 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2196 return page;
2197 }
2198
2199 /* 2185 /*
2200 * The first pass makes sure allocations are spread fairly within the 2186 * The first pass makes sure allocations are spread fairly within the
2201 * local node. However, the local node might have free pages left 2187 * local node. However, the local node might have free pages left
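Editor's note: moving prep_new_page() inside get_page_from_freelist() also moves the bad-page recovery: if the freshly grabbed page fails its checks, the code jumps back to try_this_zone and pulls another page from the same zone, instead of restarting buffered_rmqueue() as the old "goto again" did. The control flow, reduced to its skeleton with hypothetical helper names:

#include <stdbool.h>
#include <stdio.h>

struct page { int bad; };

/* Hypothetical stand-ins for buffered_rmqueue() / prep_new_page(). */
static struct page *take_page_from_zone(void);
static bool prep_page(struct page *page) { return page->bad == 0; }

static struct page pool[3] = { { 1 }, { 1 }, { 0 } };   /* two bad, one good */
static int next;

static struct page *take_page_from_zone(void)
{
        return next < 3 ? &pool[next++] : NULL;
}

static struct page *get_page_from_zone(void)
{
        struct page *page;

try_this_zone:
        page = take_page_from_zone();
        if (page) {
                if (!prep_page(page))
                        goto try_this_zone;   /* discard, retry same zone */
                return page;
        }
        return NULL;
}

int main(void)
{
        printf("got page #%d\n", (int)(get_page_from_zone() - pool));
        return 0;
}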
@@ -2208,7 +2194,7 @@ this_zone_full:
2208 alloc_flags &= ~ALLOC_FAIR; 2194 alloc_flags &= ~ALLOC_FAIR;
2209 if (nr_fair_skipped) { 2195 if (nr_fair_skipped) {
2210 zonelist_rescan = true; 2196 zonelist_rescan = true;
2211 reset_alloc_batches(preferred_zone); 2197 reset_alloc_batches(ac->preferred_zone);
2212 } 2198 }
2213 if (nr_online_nodes > 1) 2199 if (nr_online_nodes > 1)
2214 zonelist_rescan = true; 2200 zonelist_rescan = true;
@@ -2330,44 +2316,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2330 2316
2331static inline struct page * 2317static inline struct page *
2332__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2318__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 struct zonelist *zonelist, enum zone_type high_zoneidx, 2319 const struct alloc_context *ac, unsigned long *did_some_progress)
2334 nodemask_t *nodemask, struct zone *preferred_zone,
2335 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2336{ 2320{
2337 struct page *page; 2321 struct page *page;
2338 2322
2339 *did_some_progress = 0; 2323 *did_some_progress = 0;
2340 2324
2341 if (oom_killer_disabled)
2342 return NULL;
2343
2344 /* 2325 /*
2345 * Acquire the per-zone oom lock for each zone. If that 2326 * Acquire the per-zone oom lock for each zone. If that
2346 * fails, somebody else is making progress for us. 2327 * fails, somebody else is making progress for us.
2347 */ 2328 */
2348 if (!oom_zonelist_trylock(zonelist, gfp_mask)) { 2329 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
2349 *did_some_progress = 1; 2330 *did_some_progress = 1;
2350 schedule_timeout_uninterruptible(1); 2331 schedule_timeout_uninterruptible(1);
2351 return NULL; 2332 return NULL;
2352 } 2333 }
2353 2334
2354 /* 2335 /*
2355 * PM-freezer should be notified that there might be an OOM killer on
2356 * its way to kill and wake somebody up. This is too early and we might
2357 * end up not killing anything but false positives are acceptable.
2358 * See freeze_processes.
2359 */
2360 note_oom_kill();
2361
2362 /*
2363 * Go through the zonelist yet one more time, keep very high watermark 2336 * Go through the zonelist yet one more time, keep very high watermark
2364 * here, this is only to catch a parallel oom killing, we must fail if 2337 * here, this is only to catch a parallel oom killing, we must fail if
2365 * we're still under heavy pressure. 2338 * we're still under heavy pressure.
2366 */ 2339 */
2367 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2340 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2368 order, zonelist, high_zoneidx, 2341 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
2369 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2370 preferred_zone, classzone_idx, migratetype);
2371 if (page) 2342 if (page)
2372 goto out; 2343 goto out;
2373 2344
@@ -2379,11 +2350,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2379 if (order > PAGE_ALLOC_COSTLY_ORDER) 2350 if (order > PAGE_ALLOC_COSTLY_ORDER)
2380 goto out; 2351 goto out;
2381 /* The OOM killer does not needlessly kill tasks for lowmem */ 2352 /* The OOM killer does not needlessly kill tasks for lowmem */
2382 if (high_zoneidx < ZONE_NORMAL) 2353 if (ac->high_zoneidx < ZONE_NORMAL)
2383 goto out; 2354 goto out;
2384 /* The OOM killer does not compensate for light reclaim */ 2355 /* The OOM killer does not compensate for light reclaim */
2385 if (!(gfp_mask & __GFP_FS)) 2356 if (!(gfp_mask & __GFP_FS)) {
2357 /*
2358 * XXX: Page reclaim didn't yield anything,
2359 * and the OOM killer can't be invoked, but
2360 * keep looping as per should_alloc_retry().
2361 */
2362 *did_some_progress = 1;
2386 goto out; 2363 goto out;
2364 }
2387 /* 2365 /*
2388 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2366 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2389 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 2367 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2395,10 +2373,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2395 goto out; 2373 goto out;
2396 } 2374 }
2397 /* Exhausted what can be done so it's blamo time */ 2375 /* Exhausted what can be done so it's blamo time */
2398 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2376 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2399 *did_some_progress = 1; 2377 *did_some_progress = 1;
2400out: 2378out:
2401 oom_zonelist_unlock(zonelist, gfp_mask); 2379 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2402 return page; 2380 return page;
2403} 2381}
2404 2382
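Editor's note: two behavioural changes sit in this hunk: out_of_memory() now reports whether it actually killed something, and the !__GFP_FS case sets *did_some_progress even though nothing was reclaimed, purely so the slowpath keeps retrying instead of failing the allocation. The caller-side contract is essentially a loop of the following shape (simplified, with hypothetical helpers):

#include <stdio.h>

static char fake_page;                  /* stands in for a struct page */

/* Hypothetical stand-ins for direct reclaim and the OOM path. */
static void *try_reclaim(unsigned long *progress)
{
        static int pass;
        if (++pass >= 2) {              /* memory freed by the earlier OOM kill */
                *progress = 1;
                return &fake_page;
        }
        *progress = 0;
        return NULL;
}

static void *try_oom(unsigned long *progress)
{
        *progress = 1;                  /* a victim was killed: worth retrying */
        return NULL;
}

int main(void)
{
        unsigned long progress = 0;
        void *page = NULL;
        int loops = 0;

        while (!page && loops++ < 10) {
                page = try_reclaim(&progress);
                if (page)
                        break;
                page = try_oom(&progress);
                if (!page && !progress)
                        break;          /* nopage: nothing more can be done */
        }
        printf("allocation %s after %d pass(es)\n",
               page ? "succeeded" : "failed", loops);
        return 0;
}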
@@ -2406,10 +2384,9 @@ out:
2406/* Try memory compaction for high-order allocations before reclaim */ 2384/* Try memory compaction for high-order allocations before reclaim */
2407static struct page * 2385static struct page *
2408__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2386__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2409 struct zonelist *zonelist, enum zone_type high_zoneidx, 2387 int alloc_flags, const struct alloc_context *ac,
2410 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2388 enum migrate_mode mode, int *contended_compaction,
2411 int classzone_idx, int migratetype, enum migrate_mode mode, 2389 bool *deferred_compaction)
2412 int *contended_compaction, bool *deferred_compaction)
2413{ 2390{
2414 unsigned long compact_result; 2391 unsigned long compact_result;
2415 struct page *page; 2392 struct page *page;
@@ -2418,10 +2395,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2418 return NULL; 2395 return NULL;
2419 2396
2420 current->flags |= PF_MEMALLOC; 2397 current->flags |= PF_MEMALLOC;
2421 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2398 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2422 nodemask, mode, 2399 mode, contended_compaction);
2423 contended_compaction,
2424 alloc_flags, classzone_idx);
2425 current->flags &= ~PF_MEMALLOC; 2400 current->flags &= ~PF_MEMALLOC;
2426 2401
2427 switch (compact_result) { 2402 switch (compact_result) {
@@ -2440,10 +2415,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2440 */ 2415 */
2441 count_vm_event(COMPACTSTALL); 2416 count_vm_event(COMPACTSTALL);
2442 2417
2443 page = get_page_from_freelist(gfp_mask, nodemask, 2418 page = get_page_from_freelist(gfp_mask, order,
2444 order, zonelist, high_zoneidx, 2419 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2445 alloc_flags & ~ALLOC_NO_WATERMARKS,
2446 preferred_zone, classzone_idx, migratetype);
2447 2420
2448 if (page) { 2421 if (page) {
2449 struct zone *zone = page_zone(page); 2422 struct zone *zone = page_zone(page);
@@ -2467,10 +2440,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2467#else 2440#else
2468static inline struct page * 2441static inline struct page *
2469__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2442__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2470 struct zonelist *zonelist, enum zone_type high_zoneidx, 2443 int alloc_flags, const struct alloc_context *ac,
2471 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2444 enum migrate_mode mode, int *contended_compaction,
2472 int classzone_idx, int migratetype, enum migrate_mode mode, 2445 bool *deferred_compaction)
2473 int *contended_compaction, bool *deferred_compaction)
2474{ 2446{
2475 return NULL; 2447 return NULL;
2476} 2448}
@@ -2478,8 +2450,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2478 2450
2479/* Perform direct synchronous page reclaim */ 2451/* Perform direct synchronous page reclaim */
2480static int 2452static int
2481__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2453__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2482 nodemask_t *nodemask) 2454 const struct alloc_context *ac)
2483{ 2455{
2484 struct reclaim_state reclaim_state; 2456 struct reclaim_state reclaim_state;
2485 int progress; 2457 int progress;
@@ -2493,7 +2465,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2493 reclaim_state.reclaimed_slab = 0; 2465 reclaim_state.reclaimed_slab = 0;
2494 current->reclaim_state = &reclaim_state; 2466 current->reclaim_state = &reclaim_state;
2495 2467
2496 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2468 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2469 ac->nodemask);
2497 2470
2498 current->reclaim_state = NULL; 2471 current->reclaim_state = NULL;
2499 lockdep_clear_current_reclaim_state(); 2472 lockdep_clear_current_reclaim_state();
@@ -2507,28 +2480,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2507/* The really slow allocator path where we enter direct reclaim */ 2480/* The really slow allocator path where we enter direct reclaim */
2508static inline struct page * 2481static inline struct page *
2509__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2482__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2510 struct zonelist *zonelist, enum zone_type high_zoneidx, 2483 int alloc_flags, const struct alloc_context *ac,
2511 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2484 unsigned long *did_some_progress)
2512 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2513{ 2485{
2514 struct page *page = NULL; 2486 struct page *page = NULL;
2515 bool drained = false; 2487 bool drained = false;
2516 2488
2517 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2489 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
2518 nodemask);
2519 if (unlikely(!(*did_some_progress))) 2490 if (unlikely(!(*did_some_progress)))
2520 return NULL; 2491 return NULL;
2521 2492
2522 /* After successful reclaim, reconsider all zones for allocation */ 2493 /* After successful reclaim, reconsider all zones for allocation */
2523 if (IS_ENABLED(CONFIG_NUMA)) 2494 if (IS_ENABLED(CONFIG_NUMA))
2524 zlc_clear_zones_full(zonelist); 2495 zlc_clear_zones_full(ac->zonelist);
2525 2496
2526retry: 2497retry:
2527 page = get_page_from_freelist(gfp_mask, nodemask, order, 2498 page = get_page_from_freelist(gfp_mask, order,
2528 zonelist, high_zoneidx, 2499 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2529 alloc_flags & ~ALLOC_NO_WATERMARKS,
2530 preferred_zone, classzone_idx,
2531 migratetype);
2532 2500
2533 /* 2501 /*
2534 * If an allocation failed after direct reclaim, it could be because 2502 * If an allocation failed after direct reclaim, it could be because
@@ -2549,36 +2517,30 @@ retry:
2549 */ 2517 */
2550static inline struct page * 2518static inline struct page *
2551__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2519__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2552 struct zonelist *zonelist, enum zone_type high_zoneidx, 2520 const struct alloc_context *ac)
2553 nodemask_t *nodemask, struct zone *preferred_zone,
2554 int classzone_idx, int migratetype)
2555{ 2521{
2556 struct page *page; 2522 struct page *page;
2557 2523
2558 do { 2524 do {
2559 page = get_page_from_freelist(gfp_mask, nodemask, order, 2525 page = get_page_from_freelist(gfp_mask, order,
2560 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2526 ALLOC_NO_WATERMARKS, ac);
2561 preferred_zone, classzone_idx, migratetype);
2562 2527
2563 if (!page && gfp_mask & __GFP_NOFAIL) 2528 if (!page && gfp_mask & __GFP_NOFAIL)
2564 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2529 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2530 HZ/50);
2565 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2531 } while (!page && (gfp_mask & __GFP_NOFAIL));
2566 2532
2567 return page; 2533 return page;
2568} 2534}
2569 2535
2570static void wake_all_kswapds(unsigned int order, 2536static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2571 struct zonelist *zonelist,
2572 enum zone_type high_zoneidx,
2573 struct zone *preferred_zone,
2574 nodemask_t *nodemask)
2575{ 2537{
2576 struct zoneref *z; 2538 struct zoneref *z;
2577 struct zone *zone; 2539 struct zone *zone;
2578 2540
2579 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2541 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2580 high_zoneidx, nodemask) 2542 ac->high_zoneidx, ac->nodemask)
2581 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2543 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
2582} 2544}
2583 2545
2584static inline int 2546static inline int
@@ -2637,9 +2599,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2637 2599
2638static inline struct page * 2600static inline struct page *
2639__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2601__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2640 struct zonelist *zonelist, enum zone_type high_zoneidx, 2602 struct alloc_context *ac)
2641 nodemask_t *nodemask, struct zone *preferred_zone,
2642 int classzone_idx, int migratetype)
2643{ 2603{
2644 const gfp_t wait = gfp_mask & __GFP_WAIT; 2604 const gfp_t wait = gfp_mask & __GFP_WAIT;
2645 struct page *page = NULL; 2605 struct page *page = NULL;
@@ -2675,8 +2635,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2675 2635
2676retry: 2636retry:
2677 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2637 if (!(gfp_mask & __GFP_NO_KSWAPD))
2678 wake_all_kswapds(order, zonelist, high_zoneidx, 2638 wake_all_kswapds(order, ac);
2679 preferred_zone, nodemask);
2680 2639
2681 /* 2640 /*
2682 * OK, we're below the kswapd watermark and have kicked background 2641 * OK, we're below the kswapd watermark and have kicked background
@@ -2689,17 +2648,16 @@ retry:
2689 * Find the true preferred zone if the allocation is unconstrained by 2648 * Find the true preferred zone if the allocation is unconstrained by
2690 * cpusets. 2649 * cpusets.
2691 */ 2650 */
2692 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2651 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
2693 struct zoneref *preferred_zoneref; 2652 struct zoneref *preferred_zoneref;
2694 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2653 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2695 NULL, &preferred_zone); 2654 ac->high_zoneidx, NULL, &ac->preferred_zone);
2696 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2655 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
2697 } 2656 }
2698 2657
2699 /* This is the last chance, in general, before the goto nopage. */ 2658 /* This is the last chance, in general, before the goto nopage. */
2700 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2659 page = get_page_from_freelist(gfp_mask, order,
2701 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2660 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2702 preferred_zone, classzone_idx, migratetype);
2703 if (page) 2661 if (page)
2704 goto got_pg; 2662 goto got_pg;
2705 2663
@@ -2710,11 +2668,10 @@ retry:
2710 * the allocation is high priority and these type of 2668 * the allocation is high priority and these type of
2711 * allocations are system rather than user orientated 2669 * allocations are system rather than user orientated
2712 */ 2670 */
2713 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2671 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
2672
2673 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2714 2674
2715 page = __alloc_pages_high_priority(gfp_mask, order,
2716 zonelist, high_zoneidx, nodemask,
2717 preferred_zone, classzone_idx, migratetype);
2718 if (page) { 2675 if (page) {
2719 goto got_pg; 2676 goto got_pg;
2720 } 2677 }
@@ -2743,11 +2700,9 @@ retry:
2743 * Try direct compaction. The first pass is asynchronous. Subsequent 2700 * Try direct compaction. The first pass is asynchronous. Subsequent
2744 * attempts after direct reclaim are synchronous 2701 * attempts after direct reclaim are synchronous
2745 */ 2702 */
2746 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2703 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2747 high_zoneidx, nodemask, alloc_flags, 2704 migration_mode,
2748 preferred_zone, 2705 &contended_compaction,
2749 classzone_idx, migratetype,
2750 migration_mode, &contended_compaction,
2751 &deferred_compaction); 2706 &deferred_compaction);
2752 if (page) 2707 if (page)
2753 goto got_pg; 2708 goto got_pg;
@@ -2793,12 +2748,8 @@ retry:
2793 migration_mode = MIGRATE_SYNC_LIGHT; 2748 migration_mode = MIGRATE_SYNC_LIGHT;
2794 2749
2795 /* Try direct reclaim and then allocating */ 2750 /* Try direct reclaim and then allocating */
2796 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2751 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2797 zonelist, high_zoneidx, 2752 &did_some_progress);
2798 nodemask,
2799 alloc_flags, preferred_zone,
2800 classzone_idx, migratetype,
2801 &did_some_progress);
2802 if (page) 2753 if (page)
2803 goto got_pg; 2754 goto got_pg;
2804 2755
@@ -2812,17 +2763,15 @@ retry:
2812 * start OOM killing tasks. 2763 * start OOM killing tasks.
2813 */ 2764 */
2814 if (!did_some_progress) { 2765 if (!did_some_progress) {
2815 page = __alloc_pages_may_oom(gfp_mask, order, zonelist, 2766 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2816 high_zoneidx, nodemask, 2767 &did_some_progress);
2817 preferred_zone, classzone_idx,
2818 migratetype,&did_some_progress);
2819 if (page) 2768 if (page)
2820 goto got_pg; 2769 goto got_pg;
2821 if (!did_some_progress) 2770 if (!did_some_progress)
2822 goto nopage; 2771 goto nopage;
2823 } 2772 }
2824 /* Wait for some write requests to complete then retry */ 2773 /* Wait for some write requests to complete then retry */
2825 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2774 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2826 goto retry; 2775 goto retry;
2827 } else { 2776 } else {
2828 /* 2777 /*
@@ -2830,11 +2779,9 @@ retry:
2830 * direct reclaim and reclaim/compaction depends on compaction 2779 * direct reclaim and reclaim/compaction depends on compaction
2831 * being called after reclaim so call directly if necessary 2780 * being called after reclaim so call directly if necessary
2832 */ 2781 */
2833 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2782 page = __alloc_pages_direct_compact(gfp_mask, order,
2834 high_zoneidx, nodemask, alloc_flags, 2783 alloc_flags, ac, migration_mode,
2835 preferred_zone, 2784 &contended_compaction,
2836 classzone_idx, migratetype,
2837 migration_mode, &contended_compaction,
2838 &deferred_compaction); 2785 &deferred_compaction);
2839 if (page) 2786 if (page)
2840 goto got_pg; 2787 goto got_pg;
@@ -2842,11 +2789,7 @@ retry:
2842 2789
2843nopage: 2790nopage:
2844 warn_alloc_failed(gfp_mask, order, NULL); 2791 warn_alloc_failed(gfp_mask, order, NULL);
2845 return page;
2846got_pg: 2792got_pg:
2847 if (kmemcheck_enabled)
2848 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2849
2850 return page; 2793 return page;
2851} 2794}
2852 2795
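Editor's note: taken together, the slowpath hunks leave the ordering of rescue attempts unchanged; only the plumbing now goes through ac, and the kmemcheck hook moves out to __alloc_pages_nodemask(). Roughly, as a compressed skeleton with stubbed-out steps (not the real control flow in all its detail):

#include <stdio.h>

/* Stubs standing in for the individual rescue steps of the slowpath. */
static void wake_kswapds(void)            { }
static void *freelist_alloc(void)         { return NULL; }
static void *no_watermark_alloc(void)     { return NULL; }
static void *direct_compact(void)         { return NULL; }
static void *direct_reclaim(long *prog)   { *prog = 0; return NULL; }
static void *oom_kill(long *prog)         { static char p; *prog = 1; return &p; }

static void *slowpath(int can_wait, int retries)
{
        void *page;
        long progress;

        while (retries-- > 0) {
                wake_kswapds();
                if ((page = freelist_alloc()))          return page;
                if ((page = no_watermark_alloc()))      return page;
                if (!can_wait)                          return NULL;
                if ((page = direct_compact()))          return page;
                if ((page = direct_reclaim(&progress))) return page;
                if (!progress && (page = oom_kill(&progress)))
                        return page;
                if (!progress)
                        return NULL;    /* nopage */
                /* else: wait_iff_congested() and retry */
        }
        return NULL;
}

int main(void)
{
        printf("page %s\n", slowpath(1, 5) ? "allocated" : "failed");
        return 0;
}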
@@ -2857,14 +2800,16 @@ struct page *
2857__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2800__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2858 struct zonelist *zonelist, nodemask_t *nodemask) 2801 struct zonelist *zonelist, nodemask_t *nodemask)
2859{ 2802{
2860 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2861 struct zone *preferred_zone;
2862 struct zoneref *preferred_zoneref; 2803 struct zoneref *preferred_zoneref;
2863 struct page *page = NULL; 2804 struct page *page = NULL;
2864 int migratetype = gfpflags_to_migratetype(gfp_mask);
2865 unsigned int cpuset_mems_cookie; 2805 unsigned int cpuset_mems_cookie;
2866 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2806 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2867 int classzone_idx; 2807 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
2808 struct alloc_context ac = {
2809 .high_zoneidx = gfp_zone(gfp_mask),
2810 .nodemask = nodemask,
2811 .migratetype = gfpflags_to_migratetype(gfp_mask),
2812 };
2868 2813
2869 gfp_mask &= gfp_allowed_mask; 2814 gfp_mask &= gfp_allowed_mask;
2870 2815
@@ -2883,37 +2828,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2883 if (unlikely(!zonelist->_zonerefs->zone)) 2828 if (unlikely(!zonelist->_zonerefs->zone))
2884 return NULL; 2829 return NULL;
2885 2830
2886 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) 2831 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
2887 alloc_flags |= ALLOC_CMA; 2832 alloc_flags |= ALLOC_CMA;
2888 2833
2889retry_cpuset: 2834retry_cpuset:
2890 cpuset_mems_cookie = read_mems_allowed_begin(); 2835 cpuset_mems_cookie = read_mems_allowed_begin();
2891 2836
2837 /* We set it here, as __alloc_pages_slowpath might have changed it */
2838 ac.zonelist = zonelist;
2892 /* The preferred zone is used for statistics later */ 2839 /* The preferred zone is used for statistics later */
2893 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2840 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
2894 nodemask ? : &cpuset_current_mems_allowed, 2841 ac.nodemask ? : &cpuset_current_mems_allowed,
2895 &preferred_zone); 2842 &ac.preferred_zone);
2896 if (!preferred_zone) 2843 if (!ac.preferred_zone)
2897 goto out; 2844 goto out;
2898 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2845 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
2899 2846
2900 /* First allocation attempt */ 2847 /* First allocation attempt */
2901 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2848 alloc_mask = gfp_mask|__GFP_HARDWALL;
2902 zonelist, high_zoneidx, alloc_flags, 2849 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
2903 preferred_zone, classzone_idx, migratetype);
2904 if (unlikely(!page)) { 2850 if (unlikely(!page)) {
2905 /* 2851 /*
2906 * Runtime PM, block IO and its error handling path 2852 * Runtime PM, block IO and its error handling path
2907 * can deadlock because I/O on the device might not 2853 * can deadlock because I/O on the device might not
2908 * complete. 2854 * complete.
2909 */ 2855 */
2910 gfp_mask = memalloc_noio_flags(gfp_mask); 2856 alloc_mask = memalloc_noio_flags(gfp_mask);
2911 page = __alloc_pages_slowpath(gfp_mask, order, 2857
2912 zonelist, high_zoneidx, nodemask, 2858 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
2913 preferred_zone, classzone_idx, migratetype);
2914 } 2859 }
2915 2860
2916 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2861 if (kmemcheck_enabled && page)
2862 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2863
2864 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
2917 2865
2918out: 2866out:
2919 /* 2867 /*
@@ -3933,18 +3881,29 @@ static int __build_all_zonelists(void *data)
3933 return 0; 3881 return 0;
3934} 3882}
3935 3883
3884static noinline void __init
3885build_all_zonelists_init(void)
3886{
3887 __build_all_zonelists(NULL);
3888 mminit_verify_zonelist();
3889 cpuset_init_current_mems_allowed();
3890}
3891
3936/* 3892/*
3937 * Called with zonelists_mutex held always 3893 * Called with zonelists_mutex held always
3938 * unless system_state == SYSTEM_BOOTING. 3894 * unless system_state == SYSTEM_BOOTING.
3895 *
3896 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3897 * [we're only called with non-NULL zone through __meminit paths] and
3898 * (2) call of __init annotated helper build_all_zonelists_init
3899 * [protected by SYSTEM_BOOTING].
3939 */ 3900 */
3940void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3901void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3941{ 3902{
3942 set_zonelist_order(); 3903 set_zonelist_order();
3943 3904
3944 if (system_state == SYSTEM_BOOTING) { 3905 if (system_state == SYSTEM_BOOTING) {
3945 __build_all_zonelists(NULL); 3906 build_all_zonelists_init();
3946 mminit_verify_zonelist();
3947 cpuset_init_current_mems_allowed();
3948 } else { 3907 } else {
3949#ifdef CONFIG_MEMORY_HOTPLUG 3908#ifdef CONFIG_MEMORY_HOTPLUG
3950 if (zone) 3909 if (zone)
@@ -5047,8 +5006,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5047 pgdat->node_start_pfn = node_start_pfn; 5006 pgdat->node_start_pfn = node_start_pfn;
5048#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5007#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5049 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5008 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5050 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, 5009 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5051 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); 5010 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
5052#endif 5011#endif
5053 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5012 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5054 zones_size, zholes_size); 5013 zones_size, zholes_size);
@@ -5420,9 +5379,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5420 arch_zone_highest_possible_pfn[i]) 5379 arch_zone_highest_possible_pfn[i])
5421 pr_cont("empty\n"); 5380 pr_cont("empty\n");
5422 else 5381 else
5423 pr_cont("[mem %0#10lx-%0#10lx]\n", 5382 pr_cont("[mem %#018Lx-%#018Lx]\n",
5424 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5383 (u64)arch_zone_lowest_possible_pfn[i]
5425 (arch_zone_highest_possible_pfn[i] 5384 << PAGE_SHIFT,
5385 ((u64)arch_zone_highest_possible_pfn[i]
5426 << PAGE_SHIFT) - 1); 5386 << PAGE_SHIFT) - 1);
5427 } 5387 }
5428 5388
@@ -5430,15 +5390,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5430 pr_info("Movable zone start for each node\n"); 5390 pr_info("Movable zone start for each node\n");
5431 for (i = 0; i < MAX_NUMNODES; i++) { 5391 for (i = 0; i < MAX_NUMNODES; i++) {
5432 if (zone_movable_pfn[i]) 5392 if (zone_movable_pfn[i])
5433 pr_info(" Node %d: %#010lx\n", i, 5393 pr_info(" Node %d: %#018Lx\n", i,
5434 zone_movable_pfn[i] << PAGE_SHIFT); 5394 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
5435 } 5395 }
5436 5396
5437 /* Print out the early node map */ 5397 /* Print out the early node map */
5438 pr_info("Early memory node ranges\n"); 5398 pr_info("Early memory node ranges\n");
5439 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5399 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5440 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5400 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5441 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5401 (u64)start_pfn << PAGE_SHIFT,
5402 ((u64)end_pfn << PAGE_SHIFT) - 1);
5442 5403
5443 /* Initialise every node */ 5404 /* Initialise every node */
5444 mminit_verify_pageflags_layout(); 5405 mminit_verify_pageflags_layout();
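Editor's note: the printk changes in the last few hunks share one theme: PFNs are shifted as u64 and the format widens from %#010lx to %#018Lx, so that physical addresses above 4GB print correctly on 32-bit kernels whose physical address space exceeds 32 bits. A quick userspace illustration of why the cast matters (PAGE_SHIFT assumed to be 12; on an LP64 host both lines match, build with -m32 to see the first one wrap):

#include <inttypes.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        /* A PFN just above the 4GB boundary, e.g. on a 32-bit PAE system. */
        unsigned long pfn = 0x120000;   /* 0x120000 << 12 = 0x120000000 */

        /* If unsigned long is 32-bit, the first shift truncates. */
        printf("narrow shift: %#018" PRIx64 "\n",
               (uint64_t)(pfn << PAGE_SHIFT));
        printf("u64 shift:    %#018" PRIx64 "\n",
               (uint64_t)pfn << PAGE_SHIFT);
        return 0;
}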