Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 526 |
1 file changed, 234 insertions(+), 292 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c503a116..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
28 | #include <linux/kasan.h> | ||
28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
29 | #include <linux/suspend.h> | 30 | #include <linux/suspend.h> |
30 | #include <linux/pagevec.h> | 31 | #include <linux/pagevec.h> |
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
172 | * 1G machine -> (16M dma, 784M normal, 224M high) | 173 | * 1G machine -> (16M dma, 784M normal, 224M high) |
173 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 174 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
174 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 175 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
175 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 176 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
176 | * | 177 | * |
177 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 178 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
178 | * don't need any ZONE_NORMAL reservation | 179 | * don't need any ZONE_NORMAL reservation |
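As a quick sanity check of the arithmetic in the comment above, here is a minimal userspace sketch (not kernel code; it only assumes the default lowmem reserve ratios of 256 for DMA/NORMAL and 32 for HIGHMEM, which are what the "/256" and "/32" figures refer to) that reproduces the 1G example:

#include <stdio.h>

/* Worked example of the lowmem_reserve figures quoted in the comment. */
int main(void)
{
	long normal_mb = 784, high_mb = 224;

	/* NORMAL allocation: reserve 784M/256 of ZONE_DMA */
	printf("DMA reserve for NORMAL allocs:     %ld KiB\n",
	       normal_mb * 1024 / 256);
	/* HIGHMEM allocation: reserve 224M/32 of ZONE_NORMAL */
	printf("NORMAL reserve for HIGHMEM allocs: %ld KiB\n",
	       high_mb * 1024 / 32);
	/* HIGHMEM allocation: reserve (224M+784M)/256 of ZONE_DMA */
	printf("DMA reserve for HIGHMEM allocs:    %ld KiB\n",
	       (high_mb + normal_mb) * 1024 / 256);
	return 0;
}

The three figures come out to roughly 3M, 7M and 4M of protected low memory for the 16M/784M/224M split quoted above.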
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
244 | PB_migrate, PB_migrate_end); | 245 | PB_migrate, PB_migrate_end); |
245 | } | 246 | } |
246 | 247 | ||
247 | bool oom_killer_disabled __read_mostly; | ||
248 | |||
249 | #ifdef CONFIG_DEBUG_VM | 248 | #ifdef CONFIG_DEBUG_VM |
250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 249 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
251 | { | 250 | { |
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
381 | } | 380 | } |
382 | } | 381 | } |
383 | 382 | ||
384 | /* update __split_huge_page_refcount if you change this function */ | ||
385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
386 | { | ||
387 | int i; | ||
388 | int nr_pages = 1 << order; | ||
389 | int bad = 0; | ||
390 | |||
391 | if (unlikely(compound_order(page) != order)) { | ||
392 | bad_page(page, "wrong compound order", 0); | ||
393 | bad++; | ||
394 | } | ||
395 | |||
396 | __ClearPageHead(page); | ||
397 | |||
398 | for (i = 1; i < nr_pages; i++) { | ||
399 | struct page *p = page + i; | ||
400 | |||
401 | if (unlikely(!PageTail(p))) { | ||
402 | bad_page(page, "PageTail not set", 0); | ||
403 | bad++; | ||
404 | } else if (unlikely(p->first_page != page)) { | ||
405 | bad_page(page, "first_page not consistent", 0); | ||
406 | bad++; | ||
407 | } | ||
408 | __ClearPageTail(p); | ||
409 | } | ||
410 | |||
411 | return bad; | ||
412 | } | ||
413 | |||
414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 383 | static inline void prep_zero_page(struct page *page, unsigned int order, |
415 | gfp_t gfp_flags) | 384 | gfp_t gfp_flags) |
416 | { | 385 | { |
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
552 | return 0; | 521 | return 0; |
553 | 522 | ||
554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 523 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
556 | |||
557 | if (page_zone_id(page) != page_zone_id(buddy)) | 524 | if (page_zone_id(page) != page_zone_id(buddy)) |
558 | return 0; | 525 | return 0; |
559 | 526 | ||
527 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
528 | |||
560 | return 1; | 529 | return 1; |
561 | } | 530 | } |
562 | 531 | ||
563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 532 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
565 | |||
566 | /* | 533 | /* |
567 | * zone check is done late to avoid uselessly | 534 | * zone check is done late to avoid uselessly |
568 | * calculating zone/node ids for pages that could | 535 | * calculating zone/node ids for pages that could |
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
571 | if (page_zone_id(page) != page_zone_id(buddy)) | 538 | if (page_zone_id(page) != page_zone_id(buddy)) |
572 | return 0; | 539 | return 0; |
573 | 540 | ||
541 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
542 | |||
574 | return 1; | 543 | return 1; |
575 | } | 544 | } |
576 | return 0; | 545 | return 0; |
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, | |||
613 | int max_order = MAX_ORDER; | 582 | int max_order = MAX_ORDER; |
614 | 583 | ||
615 | VM_BUG_ON(!zone_is_initialized(zone)); | 584 | VM_BUG_ON(!zone_is_initialized(zone)); |
616 | 585 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
617 | if (unlikely(PageCompound(page))) | ||
618 | if (unlikely(destroy_compound_page(page, order))) | ||
619 | return; | ||
620 | 586 | ||
621 | VM_BUG_ON(migratetype == -1); | 587 | VM_BUG_ON(migratetype == -1); |
622 | if (is_migrate_isolate(migratetype)) { | 588 | if (is_migrate_isolate(migratetype)) { |
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, | |||
797 | spin_unlock(&zone->lock); | 763 | spin_unlock(&zone->lock); |
798 | } | 764 | } |
799 | 765 | ||
766 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
767 | { | ||
768 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
769 | return 0; | ||
770 | if (unlikely(!PageTail(page))) { | ||
771 | bad_page(page, "PageTail not set", 0); | ||
772 | return 1; | ||
773 | } | ||
774 | if (unlikely(page->first_page != head_page)) { | ||
775 | bad_page(page, "first_page not consistent", 0); | ||
776 | return 1; | ||
777 | } | ||
778 | return 0; | ||
779 | } | ||
780 | |||
800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 781 | static bool free_pages_prepare(struct page *page, unsigned int order) |
801 | { | 782 | { |
802 | int i; | 783 | bool compound = PageCompound(page); |
803 | int bad = 0; | 784 | int i, bad = 0; |
804 | 785 | ||
805 | VM_BUG_ON_PAGE(PageTail(page), page); | 786 | VM_BUG_ON_PAGE(PageTail(page), page); |
806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 787 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
807 | 788 | ||
808 | trace_mm_page_free(page, order); | 789 | trace_mm_page_free(page, order); |
809 | kmemcheck_free_shadow(page, order); | 790 | kmemcheck_free_shadow(page, order); |
791 | kasan_free_pages(page, order); | ||
810 | 792 | ||
811 | if (PageAnon(page)) | 793 | if (PageAnon(page)) |
812 | page->mapping = NULL; | 794 | page->mapping = NULL; |
813 | for (i = 0; i < (1 << order); i++) | 795 | bad += free_pages_check(page); |
796 | for (i = 1; i < (1 << order); i++) { | ||
797 | if (compound) | ||
798 | bad += free_tail_pages_check(page, page + i); | ||
814 | bad += free_pages_check(page + i); | 799 | bad += free_pages_check(page + i); |
800 | } | ||
815 | if (bad) | 801 | if (bad) |
816 | return false; | 802 | return false; |
817 | 803 | ||
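The hunk above folds the old destroy_compound_page() checks into the freeing path: the head page is checked once, and each tail page is verified against its head via free_tail_pages_check(). A standalone sketch of the same validation loop (plain C with a toy struct standing in for struct page; only the first_page back-pointer that the diff relies on is modelled):

#include <stdio.h>
#include <stdbool.h>

/* Toy model of the tail-page consistency check done at free time. */
struct toy_page {
	bool is_tail;
	struct toy_page *first_page;	/* points back at the head page */
};

static int check_tail(struct toy_page *head, struct toy_page *p)
{
	if (!p->is_tail) {
		fprintf(stderr, "bad page: PageTail not set\n");
		return 1;
	}
	if (p->first_page != head) {
		fprintf(stderr, "bad page: first_page not consistent\n");
		return 1;
	}
	return 0;
}

int main(void)
{
	enum { ORDER = 2 };
	struct toy_page pages[1 << ORDER] = { 0 };
	int i, bad = 0;

	for (i = 1; i < (1 << ORDER); i++) {
		pages[i].is_tail = true;
		pages[i].first_page = &pages[0];
	}
	/* Mirror of the free_pages_prepare() loop: the head is checked
	 * separately, every tail is checked against the head. */
	for (i = 1; i < (1 << ORDER); i++)
		bad += check_tail(&pages[0], &pages[i]);

	printf("bad pages: %d\n", bad);
	return bad != 0;
}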
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) | |||
970 | return 0; | 956 | return 0; |
971 | } | 957 | } |
972 | 958 | ||
973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 959 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
960 | int alloc_flags) | ||
974 | { | 961 | { |
975 | int i; | 962 | int i; |
976 | 963 | ||
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
985 | 972 | ||
986 | arch_alloc_page(page, order); | 973 | arch_alloc_page(page, order); |
987 | kernel_map_pages(page, 1 << order, 1); | 974 | kernel_map_pages(page, 1 << order, 1); |
975 | kasan_alloc_pages(page, order); | ||
988 | 976 | ||
989 | if (gfp_flags & __GFP_ZERO) | 977 | if (gfp_flags & __GFP_ZERO) |
990 | prep_zero_page(page, order, gfp_flags); | 978 | prep_zero_page(page, order, gfp_flags); |
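The kasan_alloc_pages()/kasan_free_pages() calls added in this and the earlier free_pages_prepare() hunk are the KASAN hooks for the page allocator: conceptually they unpoison a page range on allocation and poison it again on free, so later touches of freed pages can be reported. A very rough userspace model of that idea (one poison flag per page, no real shadow-memory layout, purely illustrative):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative-only model: one "poisoned" bit per page index. */
#define NPAGES 16
static bool poisoned[NPAGES];

static void model_alloc_pages(int page, int order)
{
	for (int i = 0; i < (1 << order); i++)
		poisoned[page + i] = false;	/* unpoison on allocation */
}

static void model_free_pages(int page, int order)
{
	for (int i = 0; i < (1 << order); i++)
		poisoned[page + i] = true;	/* poison on free */
}

static void model_access(int page)
{
	if (poisoned[page])
		fprintf(stderr, "use-after-free touching page %d\n", page);
}

int main(void)
{
	model_alloc_pages(0, 2);
	model_access(1);		/* fine */
	model_free_pages(0, 2);
	model_access(1);		/* reported */
	return 0;
}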
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
994 | 982 | ||
995 | set_page_owner(page, order, gfp_flags); | 983 | set_page_owner(page, order, gfp_flags); |
996 | 984 | ||
985 | /* | ||
986 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
987 | * allocate the page. The expectation is that the caller is taking | ||
988 | * steps that will free more memory. The caller should avoid the page | ||
989 | * being used for !PFMEMALLOC purposes. | ||
990 | */ | ||
991 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
992 | |||
997 | return 0; | 993 | return 0; |
998 | } | 994 | } |
999 | 995 | ||
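Setting page->pfmemalloc here, rather than back in get_page_from_freelist() (whose copy of this block is deleted further down), keeps the flag next to the rest of the page preparation. The !!(...) idiom simply collapses the masked flag into 0 or 1; a tiny illustration with a made-up flag value:

#include <stdio.h>

#define FAKE_ALLOC_NO_WATERMARKS 0x04	/* stand-in value, not the kernel's */

int main(void)
{
	int alloc_flags = 0x05;		/* some flag set including the bit above */

	/* 0x05 & 0x04 == 0x04; !! turns any non-zero value into exactly 1 */
	printf("pfmemalloc = %d\n", !!(alloc_flags & FAKE_ALLOC_NO_WATERMARKS));
	return 0;
}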
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
1130 | } | 1126 | } |
1131 | 1127 | ||
1132 | /* | 1128 | /* |
1133 | * If breaking a large block of pages, move all free pages to the preferred | 1129 | * When we are falling back to another migratetype during allocation, try to |
1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1130 | * steal extra free pages from the same pageblocks to satisfy further |
1135 | * more aggressive about taking ownership of free pages. | 1131 | * allocations, instead of polluting multiple pageblocks. |
1136 | * | 1132 | * |
1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1133 | * If we are stealing a relatively large buddy page, it is likely there will |
1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1134 | * be more free pages in the pageblock, so try to steal them all. For |
1139 | * to be allocated from MIGRATE_CMA areas. | 1135 | * reclaimable and unmovable allocations, we steal regardless of page size, |
1136 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
1137 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
1138 | * pageblocks. | ||
1140 | * | 1139 | * |
1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype |
1142 | * if it was unchanged). | 1141 | * as well. |
1143 | */ | 1142 | */ |
1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
1145 | int start_type, int fallback_type) | 1144 | int start_type, int fallback_type) |
1146 | { | 1145 | { |
1147 | int current_order = page_order(page); | 1146 | int current_order = page_order(page); |
1148 | 1147 | ||
1149 | /* | ||
1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
1152 | * is set to CMA so it is returned to the correct freelist in case | ||
1153 | * the page ends up being not actually allocated from the pcp lists. | ||
1154 | */ | ||
1155 | if (is_migrate_cma(fallback_type)) | ||
1156 | return fallback_type; | ||
1157 | |||
1158 | /* Take ownership for orders >= pageblock_order */ | 1148 | /* Take ownership for orders >= pageblock_order */ |
1159 | if (current_order >= pageblock_order) { | 1149 | if (current_order >= pageblock_order) { |
1160 | change_pageblock_range(page, current_order, start_type); | 1150 | change_pageblock_range(page, current_order, start_type); |
1161 | return start_type; | 1151 | return; |
1162 | } | 1152 | } |
1163 | 1153 | ||
1164 | if (current_order >= pageblock_order / 2 || | 1154 | if (current_order >= pageblock_order / 2 || |
1165 | start_type == MIGRATE_RECLAIMABLE || | 1155 | start_type == MIGRATE_RECLAIMABLE || |
1156 | start_type == MIGRATE_UNMOVABLE || | ||
1166 | page_group_by_mobility_disabled) { | 1157 | page_group_by_mobility_disabled) { |
1167 | int pages; | 1158 | int pages; |
1168 | 1159 | ||
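Read together, the rewritten comment and function above implement a simple policy: take whole pageblocks for orders >= pageblock_order; otherwise steal whenever the buddy is at least half a pageblock, the request is UNMOVABLE or RECLAIMABLE, or grouping by mobility is disabled; and flip the pageblock's migratetype once more than half of it has been pulled over. A compact standalone restatement of that decision logic (pure policy, no freelists; pageblock_order assumed to be 9 as on common x86 configurations):

#include <stdbool.h>
#include <stdio.h>

enum mt { MOVABLE, UNMOVABLE, RECLAIMABLE };	/* trimmed-down set */

#define PAGEBLOCK_ORDER 9	/* assumption for the example */

/* Should the allocator try to steal extra free pages (and possibly the
 * whole pageblock) when falling back to another migratetype? */
static bool should_steal(int current_order, enum mt start_type,
			 bool group_by_mobility_disabled)
{
	if (current_order >= PAGEBLOCK_ORDER)
		return true;			/* take the whole block */
	return current_order >= PAGEBLOCK_ORDER / 2 ||
	       start_type == RECLAIMABLE ||
	       start_type == UNMOVABLE ||
	       group_by_mobility_disabled;
}

/* Once stolen, claim the block's migratetype if over half of it moved. */
static bool should_claim_block(int pages_moved, bool group_by_mobility_disabled)
{
	return pages_moved >= (1 << (PAGEBLOCK_ORDER - 1)) ||
	       group_by_mobility_disabled;
}

int main(void)
{
	printf("order-3 MOVABLE fallback steals?    %d\n",
	       should_steal(3, MOVABLE, false));	/* 0 */
	printf("order-3 UNMOVABLE fallback steals?  %d\n",
	       should_steal(3, UNMOVABLE, false));	/* 1 */
	printf("claim block after moving 300 pages? %d\n",
	       should_claim_block(300, false));		/* 1: 300 >= 256 */
	return 0;
}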
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1170 | 1161 | ||
1171 | /* Claim the whole block if over half of it is free */ | 1162 | /* Claim the whole block if over half of it is free */ |
1172 | if (pages >= (1 << (pageblock_order-1)) || | 1163 | if (pages >= (1 << (pageblock_order-1)) || |
1173 | page_group_by_mobility_disabled) { | 1164 | page_group_by_mobility_disabled) |
1174 | |||
1175 | set_pageblock_migratetype(page, start_type); | 1165 | set_pageblock_migratetype(page, start_type); |
1176 | return start_type; | ||
1177 | } | ||
1178 | |||
1179 | } | 1166 | } |
1180 | |||
1181 | return fallback_type; | ||
1182 | } | 1167 | } |
1183 | 1168 | ||
1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1169 | /* Remove an element from the buddy allocator from the fallback list */ |
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1188 | struct free_area *area; | 1173 | struct free_area *area; |
1189 | unsigned int current_order; | 1174 | unsigned int current_order; |
1190 | struct page *page; | 1175 | struct page *page; |
1191 | int migratetype, new_type, i; | ||
1192 | 1176 | ||
1193 | /* Find the largest possible block of pages in the other list */ | 1177 | /* Find the largest possible block of pages in the other list */ |
1194 | for (current_order = MAX_ORDER-1; | 1178 | for (current_order = MAX_ORDER-1; |
1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1179 | current_order >= order && current_order <= MAX_ORDER-1; |
1196 | --current_order) { | 1180 | --current_order) { |
1181 | int i; | ||
1197 | for (i = 0;; i++) { | 1182 | for (i = 0;; i++) { |
1198 | migratetype = fallbacks[start_migratetype][i]; | 1183 | int migratetype = fallbacks[start_migratetype][i]; |
1184 | int buddy_type = start_migratetype; | ||
1199 | 1185 | ||
1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1186 | /* MIGRATE_RESERVE handled later if necessary */ |
1201 | if (migratetype == MIGRATE_RESERVE) | 1187 | if (migratetype == MIGRATE_RESERVE) |
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1209 | struct page, lru); | 1195 | struct page, lru); |
1210 | area->nr_free--; | 1196 | area->nr_free--; |
1211 | 1197 | ||
1212 | new_type = try_to_steal_freepages(zone, page, | 1198 | if (!is_migrate_cma(migratetype)) { |
1213 | start_migratetype, | 1199 | try_to_steal_freepages(zone, page, |
1214 | migratetype); | 1200 | start_migratetype, |
1201 | migratetype); | ||
1202 | } else { | ||
1203 | /* | ||
1204 | * When borrowing from MIGRATE_CMA, we need to | ||
1205 | * release the excess buddy pages to CMA | ||
1206 | * itself, and we do not try to steal extra | ||
1207 | * free pages. | ||
1208 | */ | ||
1209 | buddy_type = migratetype; | ||
1210 | } | ||
1215 | 1211 | ||
1216 | /* Remove the page from the freelists */ | 1212 | /* Remove the page from the freelists */ |
1217 | list_del(&page->lru); | 1213 | list_del(&page->lru); |
1218 | rmv_page_order(page); | 1214 | rmv_page_order(page); |
1219 | 1215 | ||
1220 | expand(zone, page, order, current_order, area, | 1216 | expand(zone, page, order, current_order, area, |
1221 | new_type); | 1217 | buddy_type); |
1222 | /* The freepage_migratetype may differ from pageblock's | 1218 | |
1219 | /* | ||
1220 | * The freepage_migratetype may differ from pageblock's | ||
1223 | * migratetype depending on the decisions in | 1221 | * migratetype depending on the decisions in |
1224 | * try_to_steal_freepages. This is OK as long as it does | 1222 | * try_to_steal_freepages(). This is OK as long as it |
1225 | * not differ for MIGRATE_CMA type. | 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
1224 | * we need to make sure unallocated pages flushed from | ||
1225 | * pcp lists are returned to the correct freelist. | ||
1226 | */ | 1226 | */ |
1227 | set_freepage_migratetype(page, new_type); | 1227 | set_freepage_migratetype(page, buddy_type); |
1228 | 1228 | ||
1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1230 | start_migratetype, migratetype, new_type); | 1230 | start_migratetype, migratetype); |
1231 | 1231 | ||
1232 | return page; | 1232 | return page; |
1233 | } | 1233 | } |
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) | |||
1642 | } | 1642 | } |
1643 | 1643 | ||
1644 | /* | 1644 | /* |
1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1645 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
1647 | * or two. | ||
1648 | */ | 1646 | */ |
1649 | static inline | 1647 | static inline |
1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1648 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1655 | struct page *page; | 1653 | struct page *page; |
1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1654 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1657 | 1655 | ||
1658 | again: | ||
1659 | if (likely(order == 0)) { | 1656 | if (likely(order == 0)) { |
1660 | struct per_cpu_pages *pcp; | 1657 | struct per_cpu_pages *pcp; |
1661 | struct list_head *list; | 1658 | struct list_head *list; |
@@ -1711,8 +1708,6 @@ again: | |||
1711 | local_irq_restore(flags); | 1708 | local_irq_restore(flags); |
1712 | 1709 | ||
1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1710 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
1714 | if (prep_new_page(page, order, gfp_flags)) | ||
1715 | goto again; | ||
1716 | return page; | 1711 | return page; |
1717 | 1712 | ||
1718 | failed: | 1713 | failed: |
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
2033 | * a page. | 2028 | * a page. |
2034 | */ | 2029 | */ |
2035 | static struct page * | 2030 | static struct page * |
2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2031 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2032 | const struct alloc_context *ac) |
2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
2039 | { | 2033 | { |
2034 | struct zonelist *zonelist = ac->zonelist; | ||
2040 | struct zoneref *z; | 2035 | struct zoneref *z; |
2041 | struct page *page = NULL; | 2036 | struct page *page = NULL; |
2042 | struct zone *zone; | 2037 | struct zone *zone; |
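From this hunk on, the long parameter lists are replaced by a single const struct alloc_context *ac. The structure itself is defined outside this file, but the fields dereferenced throughout the diff (ac->zonelist, ac->nodemask, ac->preferred_zone, ac->classzone_idx, ac->migratetype, ac->high_zoneidx) imply a shape roughly like the sketch below; the member types here are inferred from their use, not quoted from the kernel headers:

#include <stdio.h>

struct zonelist;
struct zone;
struct nodemask;			/* stands in for nodemask_t */

/* Inferred sketch of the context object used by the hunks below. */
struct alloc_context {
	struct zonelist *zonelist;	/* zones to try, in preference order */
	struct nodemask *nodemask;	/* allowed nodes, may be NULL */
	struct zone *preferred_zone;	/* first usable zone, used for stats */
	int classzone_idx;		/* index used for watermark checks */
	int migratetype;		/* derived from the gfp flags */
	int high_zoneidx;		/* enum zone_type in the kernel */
};

int main(void)
{
	printf("context size: %zu bytes\n", sizeof(struct alloc_context));
	return 0;
}

Grouping these into one structure is what lets __alloc_pages_slowpath() adjust the zonelist or preferred zone in one place and have every helper see the update.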
@@ -2055,8 +2050,8 @@ zonelist_scan: | |||
2055 | * Scan zonelist, looking for a zone with enough free. | 2050 | * Scan zonelist, looking for a zone with enough free. |
2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2051 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
2057 | */ | 2052 | */ |
2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2053 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
2059 | high_zoneidx, nodemask) { | 2054 | ac->nodemask) { |
2060 | unsigned long mark; | 2055 | unsigned long mark; |
2061 | 2056 | ||
2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2057 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
@@ -2073,7 +2068,7 @@ zonelist_scan: | |||
2073 | * time the page has in memory before being reclaimed. | 2068 | * time the page has in memory before being reclaimed. |
2074 | */ | 2069 | */ |
2075 | if (alloc_flags & ALLOC_FAIR) { | 2070 | if (alloc_flags & ALLOC_FAIR) { |
2076 | if (!zone_local(preferred_zone, zone)) | 2071 | if (!zone_local(ac->preferred_zone, zone)) |
2077 | break; | 2072 | break; |
2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2073 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
2079 | nr_fair_skipped++; | 2074 | nr_fair_skipped++; |
@@ -2111,7 +2106,7 @@ zonelist_scan: | |||
2111 | 2106 | ||
2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2107 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
2113 | if (!zone_watermark_ok(zone, order, mark, | 2108 | if (!zone_watermark_ok(zone, order, mark, |
2114 | classzone_idx, alloc_flags)) { | 2109 | ac->classzone_idx, alloc_flags)) { |
2115 | int ret; | 2110 | int ret; |
2116 | 2111 | ||
2117 | /* Checked here to keep the fast path fast */ | 2112 | /* Checked here to keep the fast path fast */ |
@@ -2132,7 +2127,7 @@ zonelist_scan: | |||
2132 | } | 2127 | } |
2133 | 2128 | ||
2134 | if (zone_reclaim_mode == 0 || | 2129 | if (zone_reclaim_mode == 0 || |
2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2130 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
2136 | goto this_zone_full; | 2131 | goto this_zone_full; |
2137 | 2132 | ||
2138 | /* | 2133 | /* |
@@ -2154,7 +2149,7 @@ zonelist_scan: | |||
2154 | default: | 2149 | default: |
2155 | /* did we reclaim enough */ | 2150 | /* did we reclaim enough */ |
2156 | if (zone_watermark_ok(zone, order, mark, | 2151 | if (zone_watermark_ok(zone, order, mark, |
2157 | classzone_idx, alloc_flags)) | 2152 | ac->classzone_idx, alloc_flags)) |
2158 | goto try_this_zone; | 2153 | goto try_this_zone; |
2159 | 2154 | ||
2160 | /* | 2155 | /* |
@@ -2175,27 +2170,18 @@ zonelist_scan: | |||
2175 | } | 2170 | } |
2176 | 2171 | ||
2177 | try_this_zone: | 2172 | try_this_zone: |
2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2173 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
2179 | gfp_mask, migratetype); | 2174 | gfp_mask, ac->migratetype); |
2180 | if (page) | 2175 | if (page) { |
2181 | break; | 2176 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
2177 | goto try_this_zone; | ||
2178 | return page; | ||
2179 | } | ||
2182 | this_zone_full: | 2180 | this_zone_full: |
2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2181 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2184 | zlc_mark_zone_full(zonelist, z); | 2182 | zlc_mark_zone_full(zonelist, z); |
2185 | } | 2183 | } |
2186 | 2184 | ||
2187 | if (page) { | ||
2188 | /* | ||
2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2190 | * necessary to allocate the page. The expectation is | ||
2191 | * that the caller is taking steps that will free more | ||
2192 | * memory. The caller should avoid the page being used | ||
2193 | * for !PFMEMALLOC purposes. | ||
2194 | */ | ||
2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
2196 | return page; | ||
2197 | } | ||
2198 | |||
2199 | /* | 2185 | /* |
2200 | * The first pass makes sure allocations are spread fairly within the | 2186 | * The first pass makes sure allocations are spread fairly within the |
2201 | * local node. However, the local node might have free pages left | 2187 | * local node. However, the local node might have free pages left |
@@ -2208,7 +2194,7 @@ this_zone_full: | |||
2208 | alloc_flags &= ~ALLOC_FAIR; | 2194 | alloc_flags &= ~ALLOC_FAIR; |
2209 | if (nr_fair_skipped) { | 2195 | if (nr_fair_skipped) { |
2210 | zonelist_rescan = true; | 2196 | zonelist_rescan = true; |
2211 | reset_alloc_batches(preferred_zone); | 2197 | reset_alloc_batches(ac->preferred_zone); |
2212 | } | 2198 | } |
2213 | if (nr_online_nodes > 1) | 2199 | if (nr_online_nodes > 1) |
2214 | zonelist_rescan = true; | 2200 | zonelist_rescan = true; |
@@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
2330 | 2316 | ||
2331 | static inline struct page * | 2317 | static inline struct page * |
2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2318 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2319 | const struct alloc_context *ac, unsigned long *did_some_progress) |
2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2335 | int classzone_idx, int migratetype) | ||
2336 | { | 2320 | { |
2337 | struct page *page; | 2321 | struct page *page; |
2338 | 2322 | ||
2339 | /* Acquire the per-zone oom lock for each zone */ | 2323 | *did_some_progress = 0; |
2340 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | ||
2341 | schedule_timeout_uninterruptible(1); | ||
2342 | return NULL; | ||
2343 | } | ||
2344 | 2324 | ||
2345 | /* | 2325 | /* |
2346 | * PM-freezer should be notified that there might be an OOM killer on | 2326 | * Acquire the per-zone oom lock for each zone. If that |
2347 | * its way to kill and wake somebody up. This is too early and we might | 2327 | * fails, somebody else is making progress for us. |
2348 | * end up not killing anything but false positives are acceptable. | ||
2349 | * See freeze_processes. | ||
2350 | */ | 2328 | */ |
2351 | note_oom_kill(); | 2329 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
2330 | *did_some_progress = 1; | ||
2331 | schedule_timeout_uninterruptible(1); | ||
2332 | return NULL; | ||
2333 | } | ||
2352 | 2334 | ||
2353 | /* | 2335 | /* |
2354 | * Go through the zonelist yet one more time, keep very high watermark | 2336 | * Go through the zonelist yet one more time, keep very high watermark |
2355 | * here, this is only to catch a parallel oom killing, we must fail if | 2337 | * here, this is only to catch a parallel oom killing, we must fail if |
2356 | * we're still under heavy pressure. | 2338 | * we're still under heavy pressure. |
2357 | */ | 2339 | */ |
2358 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2340 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
2359 | order, zonelist, high_zoneidx, | 2341 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
2360 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
2361 | preferred_zone, classzone_idx, migratetype); | ||
2362 | if (page) | 2342 | if (page) |
2363 | goto out; | 2343 | goto out; |
2364 | 2344 | ||
2365 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2345 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2346 | /* Coredumps can quickly deplete all memory reserves */ | ||
2347 | if (current->flags & PF_DUMPCORE) | ||
2348 | goto out; | ||
2366 | /* The OOM killer will not help higher order allocs */ | 2349 | /* The OOM killer will not help higher order allocs */ |
2367 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2350 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2368 | goto out; | 2351 | goto out; |
2369 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2352 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2370 | if (high_zoneidx < ZONE_NORMAL) | 2353 | if (ac->high_zoneidx < ZONE_NORMAL) |
2354 | goto out; | ||
2355 | /* The OOM killer does not compensate for light reclaim */ | ||
2356 | if (!(gfp_mask & __GFP_FS)) | ||
2371 | goto out; | 2357 | goto out; |
2372 | /* | 2358 | /* |
2373 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2359 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
@@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2380 | goto out; | 2366 | goto out; |
2381 | } | 2367 | } |
2382 | /* Exhausted what can be done so it's blamo time */ | 2368 | /* Exhausted what can be done so it's blamo time */ |
2383 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2369 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
2384 | 2370 | *did_some_progress = 1; | |
2385 | out: | 2371 | out: |
2386 | oom_zonelist_unlock(zonelist, gfp_mask); | 2372 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
2387 | return page; | 2373 | return page; |
2388 | } | 2374 | } |
2389 | 2375 | ||
@@ -2391,10 +2377,9 @@ out: | |||
2391 | /* Try memory compaction for high-order allocations before reclaim */ | 2377 | /* Try memory compaction for high-order allocations before reclaim */ |
2392 | static struct page * | 2378 | static struct page * |
2393 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2379 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2394 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2380 | int alloc_flags, const struct alloc_context *ac, |
2395 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2381 | enum migrate_mode mode, int *contended_compaction, |
2396 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2382 | bool *deferred_compaction) |
2397 | int *contended_compaction, bool *deferred_compaction) | ||
2398 | { | 2383 | { |
2399 | unsigned long compact_result; | 2384 | unsigned long compact_result; |
2400 | struct page *page; | 2385 | struct page *page; |
@@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2403 | return NULL; | 2388 | return NULL; |
2404 | 2389 | ||
2405 | current->flags |= PF_MEMALLOC; | 2390 | current->flags |= PF_MEMALLOC; |
2406 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2391 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
2407 | nodemask, mode, | 2392 | mode, contended_compaction); |
2408 | contended_compaction, | ||
2409 | alloc_flags, classzone_idx); | ||
2410 | current->flags &= ~PF_MEMALLOC; | 2393 | current->flags &= ~PF_MEMALLOC; |
2411 | 2394 | ||
2412 | switch (compact_result) { | 2395 | switch (compact_result) { |
@@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2425 | */ | 2408 | */ |
2426 | count_vm_event(COMPACTSTALL); | 2409 | count_vm_event(COMPACTSTALL); |
2427 | 2410 | ||
2428 | page = get_page_from_freelist(gfp_mask, nodemask, | 2411 | page = get_page_from_freelist(gfp_mask, order, |
2429 | order, zonelist, high_zoneidx, | 2412 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2430 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2431 | preferred_zone, classzone_idx, migratetype); | ||
2432 | 2413 | ||
2433 | if (page) { | 2414 | if (page) { |
2434 | struct zone *zone = page_zone(page); | 2415 | struct zone *zone = page_zone(page); |
@@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2452 | #else | 2433 | #else |
2453 | static inline struct page * | 2434 | static inline struct page * |
2454 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2435 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2455 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2436 | int alloc_flags, const struct alloc_context *ac, |
2456 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2437 | enum migrate_mode mode, int *contended_compaction, |
2457 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2438 | bool *deferred_compaction) |
2458 | int *contended_compaction, bool *deferred_compaction) | ||
2459 | { | 2439 | { |
2460 | return NULL; | 2440 | return NULL; |
2461 | } | 2441 | } |
@@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2463 | 2443 | ||
2464 | /* Perform direct synchronous page reclaim */ | 2444 | /* Perform direct synchronous page reclaim */ |
2465 | static int | 2445 | static int |
2466 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2446 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
2467 | nodemask_t *nodemask) | 2447 | const struct alloc_context *ac) |
2468 | { | 2448 | { |
2469 | struct reclaim_state reclaim_state; | 2449 | struct reclaim_state reclaim_state; |
2470 | int progress; | 2450 | int progress; |
@@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2478 | reclaim_state.reclaimed_slab = 0; | 2458 | reclaim_state.reclaimed_slab = 0; |
2479 | current->reclaim_state = &reclaim_state; | 2459 | current->reclaim_state = &reclaim_state; |
2480 | 2460 | ||
2481 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2461 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
2462 | ac->nodemask); | ||
2482 | 2463 | ||
2483 | current->reclaim_state = NULL; | 2464 | current->reclaim_state = NULL; |
2484 | lockdep_clear_current_reclaim_state(); | 2465 | lockdep_clear_current_reclaim_state(); |
@@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2492 | /* The really slow allocator path where we enter direct reclaim */ | 2473 | /* The really slow allocator path where we enter direct reclaim */ |
2493 | static inline struct page * | 2474 | static inline struct page * |
2494 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2475 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2495 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2476 | int alloc_flags, const struct alloc_context *ac, |
2496 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2477 | unsigned long *did_some_progress) |
2497 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2498 | { | 2478 | { |
2499 | struct page *page = NULL; | 2479 | struct page *page = NULL; |
2500 | bool drained = false; | 2480 | bool drained = false; |
2501 | 2481 | ||
2502 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2482 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
2503 | nodemask); | ||
2504 | if (unlikely(!(*did_some_progress))) | 2483 | if (unlikely(!(*did_some_progress))) |
2505 | return NULL; | 2484 | return NULL; |
2506 | 2485 | ||
2507 | /* After successful reclaim, reconsider all zones for allocation */ | 2486 | /* After successful reclaim, reconsider all zones for allocation */ |
2508 | if (IS_ENABLED(CONFIG_NUMA)) | 2487 | if (IS_ENABLED(CONFIG_NUMA)) |
2509 | zlc_clear_zones_full(zonelist); | 2488 | zlc_clear_zones_full(ac->zonelist); |
2510 | 2489 | ||
2511 | retry: | 2490 | retry: |
2512 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2491 | page = get_page_from_freelist(gfp_mask, order, |
2513 | zonelist, high_zoneidx, | 2492 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2514 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2515 | preferred_zone, classzone_idx, | ||
2516 | migratetype); | ||
2517 | 2493 | ||
2518 | /* | 2494 | /* |
2519 | * If an allocation failed after direct reclaim, it could be because | 2495 | * If an allocation failed after direct reclaim, it could be because |
@@ -2534,36 +2510,30 @@ retry: | |||
2534 | */ | 2510 | */ |
2535 | static inline struct page * | 2511 | static inline struct page * |
2536 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2512 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2537 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2513 | const struct alloc_context *ac) |
2538 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2539 | int classzone_idx, int migratetype) | ||
2540 | { | 2514 | { |
2541 | struct page *page; | 2515 | struct page *page; |
2542 | 2516 | ||
2543 | do { | 2517 | do { |
2544 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2518 | page = get_page_from_freelist(gfp_mask, order, |
2545 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2519 | ALLOC_NO_WATERMARKS, ac); |
2546 | preferred_zone, classzone_idx, migratetype); | ||
2547 | 2520 | ||
2548 | if (!page && gfp_mask & __GFP_NOFAIL) | 2521 | if (!page && gfp_mask & __GFP_NOFAIL) |
2549 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2522 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
2523 | HZ/50); | ||
2550 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2524 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2551 | 2525 | ||
2552 | return page; | 2526 | return page; |
2553 | } | 2527 | } |
2554 | 2528 | ||
2555 | static void wake_all_kswapds(unsigned int order, | 2529 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
2556 | struct zonelist *zonelist, | ||
2557 | enum zone_type high_zoneidx, | ||
2558 | struct zone *preferred_zone, | ||
2559 | nodemask_t *nodemask) | ||
2560 | { | 2530 | { |
2561 | struct zoneref *z; | 2531 | struct zoneref *z; |
2562 | struct zone *zone; | 2532 | struct zone *zone; |
2563 | 2533 | ||
2564 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2534 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
2565 | high_zoneidx, nodemask) | 2535 | ac->high_zoneidx, ac->nodemask) |
2566 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2536 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
2567 | } | 2537 | } |
2568 | 2538 | ||
2569 | static inline int | 2539 | static inline int |
@@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
2622 | 2592 | ||
2623 | static inline struct page * | 2593 | static inline struct page * |
2624 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2594 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2625 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2595 | struct alloc_context *ac) |
2626 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2627 | int classzone_idx, int migratetype) | ||
2628 | { | 2596 | { |
2629 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2597 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2630 | struct page *page = NULL; | 2598 | struct page *page = NULL; |
@@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2658 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2626 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
2659 | goto nopage; | 2627 | goto nopage; |
2660 | 2628 | ||
2661 | restart: | 2629 | retry: |
2662 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2630 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2663 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2631 | wake_all_kswapds(order, ac); |
2664 | preferred_zone, nodemask); | ||
2665 | 2632 | ||
2666 | /* | 2633 | /* |
2667 | * OK, we're below the kswapd watermark and have kicked background | 2634 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2674,18 +2641,16 @@ restart: | |||
2674 | * Find the true preferred zone if the allocation is unconstrained by | 2641 | * Find the true preferred zone if the allocation is unconstrained by |
2675 | * cpusets. | 2642 | * cpusets. |
2676 | */ | 2643 | */ |
2677 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2644 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
2678 | struct zoneref *preferred_zoneref; | 2645 | struct zoneref *preferred_zoneref; |
2679 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2646 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
2680 | NULL, &preferred_zone); | 2647 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
2681 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2648 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2682 | } | 2649 | } |
2683 | 2650 | ||
2684 | rebalance: | ||
2685 | /* This is the last chance, in general, before the goto nopage. */ | 2651 | /* This is the last chance, in general, before the goto nopage. */ |
2686 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2652 | page = get_page_from_freelist(gfp_mask, order, |
2687 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2653 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2688 | preferred_zone, classzone_idx, migratetype); | ||
2689 | if (page) | 2654 | if (page) |
2690 | goto got_pg; | 2655 | goto got_pg; |
2691 | 2656 | ||
@@ -2696,11 +2661,10 @@ rebalance: | |||
2696 | * the allocation is high priority and these type of | 2661 | * the allocation is high priority and these type of |
2697 | * allocations are system rather than user orientated | 2662 | * allocations are system rather than user orientated |
2698 | */ | 2663 | */ |
2699 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2664 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
2665 | |||
2666 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
2700 | 2667 | ||
2701 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
2702 | zonelist, high_zoneidx, nodemask, | ||
2703 | preferred_zone, classzone_idx, migratetype); | ||
2704 | if (page) { | 2668 | if (page) { |
2705 | goto got_pg; | 2669 | goto got_pg; |
2706 | } | 2670 | } |
@@ -2729,11 +2693,9 @@ rebalance: | |||
2729 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2693 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2730 | * attempts after direct reclaim are synchronous | 2694 | * attempts after direct reclaim are synchronous |
2731 | */ | 2695 | */ |
2732 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2696 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
2733 | high_zoneidx, nodemask, alloc_flags, | 2697 | migration_mode, |
2734 | preferred_zone, | 2698 | &contended_compaction, |
2735 | classzone_idx, migratetype, | ||
2736 | migration_mode, &contended_compaction, | ||
2737 | &deferred_compaction); | 2699 | &deferred_compaction); |
2738 | if (page) | 2700 | if (page) |
2739 | goto got_pg; | 2701 | goto got_pg; |
@@ -2779,74 +2741,40 @@ rebalance: | |||
2779 | migration_mode = MIGRATE_SYNC_LIGHT; | 2741 | migration_mode = MIGRATE_SYNC_LIGHT; |
2780 | 2742 | ||
2781 | /* Try direct reclaim and then allocating */ | 2743 | /* Try direct reclaim and then allocating */ |
2782 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2744 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
2783 | zonelist, high_zoneidx, | 2745 | &did_some_progress); |
2784 | nodemask, | ||
2785 | alloc_flags, preferred_zone, | ||
2786 | classzone_idx, migratetype, | ||
2787 | &did_some_progress); | ||
2788 | if (page) | 2746 | if (page) |
2789 | goto got_pg; | 2747 | goto got_pg; |
2790 | 2748 | ||
2791 | /* | ||
2792 | * If we failed to make any progress reclaiming, then we are | ||
2793 | * running out of options and have to consider going OOM | ||
2794 | */ | ||
2795 | if (!did_some_progress) { | ||
2796 | if (oom_gfp_allowed(gfp_mask)) { | ||
2797 | if (oom_killer_disabled) | ||
2798 | goto nopage; | ||
2799 | /* Coredumps can quickly deplete all memory reserves */ | ||
2800 | if ((current->flags & PF_DUMPCORE) && | ||
2801 | !(gfp_mask & __GFP_NOFAIL)) | ||
2802 | goto nopage; | ||
2803 | page = __alloc_pages_may_oom(gfp_mask, order, | ||
2804 | zonelist, high_zoneidx, | ||
2805 | nodemask, preferred_zone, | ||
2806 | classzone_idx, migratetype); | ||
2807 | if (page) | ||
2808 | goto got_pg; | ||
2809 | |||
2810 | if (!(gfp_mask & __GFP_NOFAIL)) { | ||
2811 | /* | ||
2812 | * The oom killer is not called for high-order | ||
2813 | * allocations that may fail, so if no progress | ||
2814 | * is being made, there are no other options and | ||
2815 | * retrying is unlikely to help. | ||
2816 | */ | ||
2817 | if (order > PAGE_ALLOC_COSTLY_ORDER) | ||
2818 | goto nopage; | ||
2819 | /* | ||
2820 | * The oom killer is not called for lowmem | ||
2821 | * allocations to prevent needlessly killing | ||
2822 | * innocent tasks. | ||
2823 | */ | ||
2824 | if (high_zoneidx < ZONE_NORMAL) | ||
2825 | goto nopage; | ||
2826 | } | ||
2827 | |||
2828 | goto restart; | ||
2829 | } | ||
2830 | } | ||
2831 | |||
2832 | /* Check if we should retry the allocation */ | 2749 | /* Check if we should retry the allocation */ |
2833 | pages_reclaimed += did_some_progress; | 2750 | pages_reclaimed += did_some_progress; |
2834 | if (should_alloc_retry(gfp_mask, order, did_some_progress, | 2751 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2835 | pages_reclaimed)) { | 2752 | pages_reclaimed)) { |
2753 | /* | ||
2754 | * If we fail to make progress by freeing individual | ||
2755 | * pages, but the allocation wants us to keep going, | ||
2756 | * start OOM killing tasks. | ||
2757 | */ | ||
2758 | if (!did_some_progress) { | ||
2759 | page = __alloc_pages_may_oom(gfp_mask, order, ac, | ||
2760 | &did_some_progress); | ||
2761 | if (page) | ||
2762 | goto got_pg; | ||
2763 | if (!did_some_progress) | ||
2764 | goto nopage; | ||
2765 | } | ||
2836 | /* Wait for some write requests to complete then retry */ | 2766 | /* Wait for some write requests to complete then retry */ |
2837 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2767 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
2838 | goto rebalance; | 2768 | goto retry; |
2839 | } else { | 2769 | } else { |
2840 | /* | 2770 | /* |
2841 | * High-order allocations do not necessarily loop after | 2771 | * High-order allocations do not necessarily loop after |
2842 | * direct reclaim and reclaim/compaction depends on compaction | 2772 | * direct reclaim and reclaim/compaction depends on compaction |
2843 | * being called after reclaim so call directly if necessary | 2773 | * being called after reclaim so call directly if necessary |
2844 | */ | 2774 | */ |
2845 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2775 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2846 | high_zoneidx, nodemask, alloc_flags, | 2776 | alloc_flags, ac, migration_mode, |
2847 | preferred_zone, | 2777 | &contended_compaction, |
2848 | classzone_idx, migratetype, | ||
2849 | migration_mode, &contended_compaction, | ||
2850 | &deferred_compaction); | 2778 | &deferred_compaction); |
2851 | if (page) | 2779 | if (page) |
2852 | goto got_pg; | 2780 | goto got_pg; |
@@ -2854,11 +2782,7 @@ rebalance: | |||
2854 | 2782 | ||
2855 | nopage: | 2783 | nopage: |
2856 | warn_alloc_failed(gfp_mask, order, NULL); | 2784 | warn_alloc_failed(gfp_mask, order, NULL); |
2857 | return page; | ||
2858 | got_pg: | 2785 | got_pg: |
2859 | if (kmemcheck_enabled) | ||
2860 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2861 | |||
2862 | return page; | 2786 | return page; |
2863 | } | 2787 | } |
2864 | 2788 | ||
@@ -2869,14 +2793,16 @@ struct page * | |||
2869 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2793 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2870 | struct zonelist *zonelist, nodemask_t *nodemask) | 2794 | struct zonelist *zonelist, nodemask_t *nodemask) |
2871 | { | 2795 | { |
2872 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
2873 | struct zone *preferred_zone; | ||
2874 | struct zoneref *preferred_zoneref; | 2796 | struct zoneref *preferred_zoneref; |
2875 | struct page *page = NULL; | 2797 | struct page *page = NULL; |
2876 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
2877 | unsigned int cpuset_mems_cookie; | 2798 | unsigned int cpuset_mems_cookie; |
2878 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2799 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2879 | int classzone_idx; | 2800 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
2801 | struct alloc_context ac = { | ||
2802 | .high_zoneidx = gfp_zone(gfp_mask), | ||
2803 | .nodemask = nodemask, | ||
2804 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
2805 | }; | ||
2880 | 2806 | ||
2881 | gfp_mask &= gfp_allowed_mask; | 2807 | gfp_mask &= gfp_allowed_mask; |
2882 | 2808 | ||
@@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2895 | if (unlikely(!zonelist->_zonerefs->zone)) | 2821 | if (unlikely(!zonelist->_zonerefs->zone)) |
2896 | return NULL; | 2822 | return NULL; |
2897 | 2823 | ||
2898 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2824 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
2899 | alloc_flags |= ALLOC_CMA; | 2825 | alloc_flags |= ALLOC_CMA; |
2900 | 2826 | ||
2901 | retry_cpuset: | 2827 | retry_cpuset: |
2902 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2828 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2903 | 2829 | ||
2830 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
2831 | ac.zonelist = zonelist; | ||
2904 | /* The preferred zone is used for statistics later */ | 2832 | /* The preferred zone is used for statistics later */ |
2905 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2833 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
2906 | nodemask ? : &cpuset_current_mems_allowed, | 2834 | ac.nodemask ? : &cpuset_current_mems_allowed, |
2907 | &preferred_zone); | 2835 | &ac.preferred_zone); |
2908 | if (!preferred_zone) | 2836 | if (!ac.preferred_zone) |
2909 | goto out; | 2837 | goto out; |
2910 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2838 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2911 | 2839 | ||
2912 | /* First allocation attempt */ | 2840 | /* First allocation attempt */ |
2913 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2841 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
2914 | zonelist, high_zoneidx, alloc_flags, | 2842 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
2915 | preferred_zone, classzone_idx, migratetype); | ||
2916 | if (unlikely(!page)) { | 2843 | if (unlikely(!page)) { |
2917 | /* | 2844 | /* |
2918 | * Runtime PM, block IO and its error handling path | 2845 | * Runtime PM, block IO and its error handling path |
2919 | * can deadlock because I/O on the device might not | 2846 | * can deadlock because I/O on the device might not |
2920 | * complete. | 2847 | * complete. |
2921 | */ | 2848 | */ |
2922 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2849 | alloc_mask = memalloc_noio_flags(gfp_mask); |
2923 | page = __alloc_pages_slowpath(gfp_mask, order, | 2850 | |
2924 | zonelist, high_zoneidx, nodemask, | 2851 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
2925 | preferred_zone, classzone_idx, migratetype); | ||
2926 | } | 2852 | } |
2927 | 2853 | ||
2928 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2854 | if (kmemcheck_enabled && page) |
2855 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2856 | |||
2857 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
2929 | 2858 | ||
2930 | out: | 2859 | out: |
2931 | /* | 2860 | /* |
@@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data) | |||
3945 | return 0; | 3874 | return 0; |
3946 | } | 3875 | } |
3947 | 3876 | ||
3877 | static noinline void __init | ||
3878 | build_all_zonelists_init(void) | ||
3879 | { | ||
3880 | __build_all_zonelists(NULL); | ||
3881 | mminit_verify_zonelist(); | ||
3882 | cpuset_init_current_mems_allowed(); | ||
3883 | } | ||
3884 | |||
3948 | /* | 3885 | /* |
3949 | * Called with zonelists_mutex held always | 3886 | * Called with zonelists_mutex held always |
3950 | * unless system_state == SYSTEM_BOOTING. | 3887 | * unless system_state == SYSTEM_BOOTING. |
3888 | * | ||
3889 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | ||
3890 | * [we're only called with non-NULL zone through __meminit paths] and | ||
3891 | * (2) call of __init annotated helper build_all_zonelists_init | ||
3892 | * [protected by SYSTEM_BOOTING]. | ||
3951 | */ | 3893 | */ |
3952 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3894 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3953 | { | 3895 | { |
3954 | set_zonelist_order(); | 3896 | set_zonelist_order(); |
3955 | 3897 | ||
3956 | if (system_state == SYSTEM_BOOTING) { | 3898 | if (system_state == SYSTEM_BOOTING) { |
3957 | __build_all_zonelists(NULL); | 3899 | build_all_zonelists_init(); |
3958 | mminit_verify_zonelist(); | ||
3959 | cpuset_init_current_mems_allowed(); | ||
3960 | } else { | 3900 | } else { |
3961 | #ifdef CONFIG_MEMORY_HOTPLUG | 3901 | #ifdef CONFIG_MEMORY_HOTPLUG |
3962 | if (zone) | 3902 | if (zone) |
@@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5059 | pgdat->node_start_pfn = node_start_pfn; | 4999 | pgdat->node_start_pfn = node_start_pfn; |
5060 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5000 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
5061 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5001 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
5062 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 5002 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5063 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 5003 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
5064 | #endif | 5004 | #endif |
5065 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5005 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5066 | zones_size, zholes_size); | 5006 | zones_size, zholes_size); |
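The printk changes in this and the following hunks widen the format from %#010lx to %#018Lx and cast the pfn arithmetic to u64 before shifting, presumably so that physical addresses above 4GB print in full even on 32-bit kernels where unsigned long is 32 bits. A small userspace illustration of both points (using %llx, the userspace spelling of the kernel's %Lx):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t pfn = 0x12345678;	/* pfn whose physical address exceeds 4GB */
	int page_shift = 12;		/* 4K pages assumed */

	/* Shifting in 32 bits first loses the top bits ... */
	printf("32-bit shift: %#018llx\n",
	       (unsigned long long)(uint32_t)(pfn << page_shift));
	/* ... casting to a 64-bit type before the shift keeps them. */
	printf("64-bit shift: %#018llx\n",
	       (unsigned long long)pfn << page_shift);
	return 0;
}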
@@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5432 | arch_zone_highest_possible_pfn[i]) | 5372 | arch_zone_highest_possible_pfn[i]) |
5433 | pr_cont("empty\n"); | 5373 | pr_cont("empty\n"); |
5434 | else | 5374 | else |
5435 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5375 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
5436 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5376 | (u64)arch_zone_lowest_possible_pfn[i] |
5437 | (arch_zone_highest_possible_pfn[i] | 5377 | << PAGE_SHIFT, |
5378 | ((u64)arch_zone_highest_possible_pfn[i] | ||
5438 | << PAGE_SHIFT) - 1); | 5379 | << PAGE_SHIFT) - 1); |
5439 | } | 5380 | } |
5440 | 5381 | ||
@@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5442 | pr_info("Movable zone start for each node\n"); | 5383 | pr_info("Movable zone start for each node\n"); |
5443 | for (i = 0; i < MAX_NUMNODES; i++) { | 5384 | for (i = 0; i < MAX_NUMNODES; i++) { |
5444 | if (zone_movable_pfn[i]) | 5385 | if (zone_movable_pfn[i]) |
5445 | pr_info(" Node %d: %#010lx\n", i, | 5386 | pr_info(" Node %d: %#018Lx\n", i, |
5446 | zone_movable_pfn[i] << PAGE_SHIFT); | 5387 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
5447 | } | 5388 | } |
5448 | 5389 | ||
5449 | /* Print out the early node map */ | 5390 | /* Print out the early node map */ |
5450 | pr_info("Early memory node ranges\n"); | 5391 | pr_info("Early memory node ranges\n"); |
5451 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5392 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5452 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5393 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
5453 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5394 | (u64)start_pfn << PAGE_SHIFT, |
5395 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
5454 | 5396 | ||
5455 | /* Initialise every node */ | 5397 | /* Initialise every node */ |
5456 | mminit_verify_pageflags_layout(); | 5398 | mminit_verify_pageflags_layout(); |