Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 471
1 file changed, 216 insertions, 255 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e20f9c2fa5a..7abfa70cdc1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
28 | #include <linux/kasan.h> | ||
28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
29 | #include <linux/suspend.h> | 30 | #include <linux/suspend.h> |
30 | #include <linux/pagevec.h> | 31 | #include <linux/pagevec.h> |
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
172 | * 1G machine -> (16M dma, 784M normal, 224M high) | 173 | * 1G machine -> (16M dma, 784M normal, 224M high) |
173 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 174 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
174 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 175 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
175 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 176 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
176 | * | 177 | * |
177 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 178 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
178 | * don't need any ZONE_NORMAL reservation | 179 | * don't need any ZONE_NORMAL reservation |
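A worked reading of the ratios in the comment above, assuming the default lowmem_reserve_ratio values of 256 for ZONE_DMA and 32 for ZONE_NORMAL (the patch does not change them):

	NORMAL allocation:   784M / 256           ~= 3 MB kept free in ZONE_DMA
	HIGHMEM allocation:  224M / 32            =  7 MB kept free in ZONE_NORMAL
	                     (224M + 784M) / 256  ~= 4 MB kept free in ZONE_DMA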
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
244 | PB_migrate, PB_migrate_end); | 245 | PB_migrate, PB_migrate_end); |
245 | } | 246 | } |
246 | 247 | ||
247 | bool oom_killer_disabled __read_mostly; | ||
248 | |||
249 | #ifdef CONFIG_DEBUG_VM | 248 | #ifdef CONFIG_DEBUG_VM |
250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 249 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
251 | { | 250 | { |
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
381 | } | 380 | } |
382 | } | 381 | } |
383 | 382 | ||
384 | /* update __split_huge_page_refcount if you change this function */ | ||
385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
386 | { | ||
387 | int i; | ||
388 | int nr_pages = 1 << order; | ||
389 | int bad = 0; | ||
390 | |||
391 | if (unlikely(compound_order(page) != order)) { | ||
392 | bad_page(page, "wrong compound order", 0); | ||
393 | bad++; | ||
394 | } | ||
395 | |||
396 | __ClearPageHead(page); | ||
397 | |||
398 | for (i = 1; i < nr_pages; i++) { | ||
399 | struct page *p = page + i; | ||
400 | |||
401 | if (unlikely(!PageTail(p))) { | ||
402 | bad_page(page, "PageTail not set", 0); | ||
403 | bad++; | ||
404 | } else if (unlikely(p->first_page != page)) { | ||
405 | bad_page(page, "first_page not consistent", 0); | ||
406 | bad++; | ||
407 | } | ||
408 | __ClearPageTail(p); | ||
409 | } | ||
410 | |||
411 | return bad; | ||
412 | } | ||
413 | |||
414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 383 | static inline void prep_zero_page(struct page *page, unsigned int order, |
415 | gfp_t gfp_flags) | 384 | gfp_t gfp_flags) |
416 | { | 385 | { |
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
552 | return 0; | 521 | return 0; |
553 | 522 | ||
554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 523 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
556 | |||
557 | if (page_zone_id(page) != page_zone_id(buddy)) | 524 | if (page_zone_id(page) != page_zone_id(buddy)) |
558 | return 0; | 525 | return 0; |
559 | 526 | ||
527 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
528 | |||
560 | return 1; | 529 | return 1; |
561 | } | 530 | } |
562 | 531 | ||
563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 532 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
565 | |||
566 | /* | 533 | /* |
567 | * zone check is done late to avoid uselessly | 534 | * zone check is done late to avoid uselessly |
568 | * calculating zone/node ids for pages that could | 535 | * calculating zone/node ids for pages that could |
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
571 | if (page_zone_id(page) != page_zone_id(buddy)) | 538 | if (page_zone_id(page) != page_zone_id(buddy)) |
572 | return 0; | 539 | return 0; |
573 | 540 | ||
541 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
542 | |||
574 | return 1; | 543 | return 1; |
575 | } | 544 | } |
576 | return 0; | 545 | return 0; |
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, | |||
613 | int max_order = MAX_ORDER; | 582 | int max_order = MAX_ORDER; |
614 | 583 | ||
615 | VM_BUG_ON(!zone_is_initialized(zone)); | 584 | VM_BUG_ON(!zone_is_initialized(zone)); |
616 | 585 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
617 | if (unlikely(PageCompound(page))) | ||
618 | if (unlikely(destroy_compound_page(page, order))) | ||
619 | return; | ||
620 | 586 | ||
621 | VM_BUG_ON(migratetype == -1); | 587 | VM_BUG_ON(migratetype == -1); |
622 | if (is_migrate_isolate(migratetype)) { | 588 | if (is_migrate_isolate(migratetype)) { |
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, | |||
797 | spin_unlock(&zone->lock); | 763 | spin_unlock(&zone->lock); |
798 | } | 764 | } |
799 | 765 | ||
766 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
767 | { | ||
768 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
769 | return 0; | ||
770 | if (unlikely(!PageTail(page))) { | ||
771 | bad_page(page, "PageTail not set", 0); | ||
772 | return 1; | ||
773 | } | ||
774 | if (unlikely(page->first_page != head_page)) { | ||
775 | bad_page(page, "first_page not consistent", 0); | ||
776 | return 1; | ||
777 | } | ||
778 | return 0; | ||
779 | } | ||
780 | |||
800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 781 | static bool free_pages_prepare(struct page *page, unsigned int order) |
801 | { | 782 | { |
802 | int i; | 783 | bool compound = PageCompound(page); |
803 | int bad = 0; | 784 | int i, bad = 0; |
804 | 785 | ||
805 | VM_BUG_ON_PAGE(PageTail(page), page); | 786 | VM_BUG_ON_PAGE(PageTail(page), page); |
806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 787 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
807 | 788 | ||
808 | trace_mm_page_free(page, order); | 789 | trace_mm_page_free(page, order); |
809 | kmemcheck_free_shadow(page, order); | 790 | kmemcheck_free_shadow(page, order); |
791 | kasan_free_pages(page, order); | ||
810 | 792 | ||
811 | if (PageAnon(page)) | 793 | if (PageAnon(page)) |
812 | page->mapping = NULL; | 794 | page->mapping = NULL; |
813 | for (i = 0; i < (1 << order); i++) | 795 | bad += free_pages_check(page); |
796 | for (i = 1; i < (1 << order); i++) { | ||
797 | if (compound) | ||
798 | bad += free_tail_pages_check(page, page + i); | ||
814 | bad += free_pages_check(page + i); | 799 | bad += free_pages_check(page + i); |
800 | } | ||
815 | if (bad) | 801 | if (bad) |
816 | return false; | 802 | return false; |
817 | 803 | ||
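The new free_tail_pages_check() relies on IS_ENABLED(CONFIG_DEBUG_VM), so in non-debug builds the per-tail-page validation compiles away and the compound case in free_pages_prepare() reduces to the free_pages_check() calls that were already there. A minimal, generic sketch of that pattern (not code from this patch):

	#include <linux/kconfig.h>

	/* With CONFIG_DEBUG_VM=n this returns 0 unconditionally and the
	 * compiler discards everything below the IS_ENABLED() test. */
	static int debug_vm_only_check(int suspicious)
	{
		if (!IS_ENABLED(CONFIG_DEBUG_VM))
			return 0;
		return suspicious ? 1 : 0;
	}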
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) | |||
970 | return 0; | 956 | return 0; |
971 | } | 957 | } |
972 | 958 | ||
973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 959 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
960 | int alloc_flags) | ||
974 | { | 961 | { |
975 | int i; | 962 | int i; |
976 | 963 | ||
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
985 | 972 | ||
986 | arch_alloc_page(page, order); | 973 | arch_alloc_page(page, order); |
987 | kernel_map_pages(page, 1 << order, 1); | 974 | kernel_map_pages(page, 1 << order, 1); |
975 | kasan_alloc_pages(page, order); | ||
988 | 976 | ||
989 | if (gfp_flags & __GFP_ZERO) | 977 | if (gfp_flags & __GFP_ZERO) |
990 | prep_zero_page(page, order, gfp_flags); | 978 | prep_zero_page(page, order, gfp_flags); |
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
994 | 982 | ||
995 | set_page_owner(page, order, gfp_flags); | 983 | set_page_owner(page, order, gfp_flags); |
996 | 984 | ||
985 | /* | ||
986 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
987 | * allocate the page. The expectation is that the caller is taking | ||
988 | * steps that will free more memory. The caller should avoid the page | ||
989 | * being used for !PFMEMALLOC purposes. | ||
990 | */ | ||
991 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
992 | |||
997 | return 0; | 993 | return 0; |
998 | } | 994 | } |
999 | 995 | ||
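Context for the relocated page->pfmemalloc assignment: ALLOC_NO_WATERMARKS is only granted to callers expected to free memory shortly, so consumers are meant to treat such pages as an emergency reserve. A purely hypothetical consumer-side check to illustrate the intent (not code from this patch):

	/* Illustrative only: hand an emergency-reserve page back instead of
	 * caching it for a request that did not ask for reserves. */
	if (page->pfmemalloc && !(gfp_mask & __GFP_MEMALLOC)) {
		__free_pages(page, order);
		page = NULL;
	}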
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
1130 | } | 1126 | } |
1131 | 1127 | ||
1132 | /* | 1128 | /* |
1133 | * If breaking a large block of pages, move all free pages to the preferred | 1129 | * When we are falling back to another migratetype during allocation, try to |
1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1130 | * steal extra free pages from the same pageblocks to satisfy further |
1135 | * more aggressive about taking ownership of free pages. | 1131 | * allocations, instead of polluting multiple pageblocks. |
1136 | * | 1132 | * |
1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1133 | * If we are stealing a relatively large buddy page, it is likely there will |
1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1134 | * be more free pages in the pageblock, so try to steal them all. For |
1139 | * to be allocated from MIGRATE_CMA areas. | 1135 | * reclaimable and unmovable allocations, we steal regardless of page size, |
1136 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
1137 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
1138 | * pageblocks. | ||
1140 | * | 1139 | * |
1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype |
1142 | * if it was unchanged). | 1141 | * as well. |
1143 | */ | 1142 | */ |
1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
1145 | int start_type, int fallback_type) | 1144 | int start_type, int fallback_type) |
1146 | { | 1145 | { |
1147 | int current_order = page_order(page); | 1146 | int current_order = page_order(page); |
1148 | 1147 | ||
1149 | /* | ||
1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
1152 | * is set to CMA so it is returned to the correct freelist in case | ||
1153 | * the page ends up being not actually allocated from the pcp lists. | ||
1154 | */ | ||
1155 | if (is_migrate_cma(fallback_type)) | ||
1156 | return fallback_type; | ||
1157 | |||
1158 | /* Take ownership for orders >= pageblock_order */ | 1148 | /* Take ownership for orders >= pageblock_order */ |
1159 | if (current_order >= pageblock_order) { | 1149 | if (current_order >= pageblock_order) { |
1160 | change_pageblock_range(page, current_order, start_type); | 1150 | change_pageblock_range(page, current_order, start_type); |
1161 | return start_type; | 1151 | return; |
1162 | } | 1152 | } |
1163 | 1153 | ||
1164 | if (current_order >= pageblock_order / 2 || | 1154 | if (current_order >= pageblock_order / 2 || |
1165 | start_type == MIGRATE_RECLAIMABLE || | 1155 | start_type == MIGRATE_RECLAIMABLE || |
1156 | start_type == MIGRATE_UNMOVABLE || | ||
1166 | page_group_by_mobility_disabled) { | 1157 | page_group_by_mobility_disabled) { |
1167 | int pages; | 1158 | int pages; |
1168 | 1159 | ||
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1170 | 1161 | ||
1171 | /* Claim the whole block if over half of it is free */ | 1162 | /* Claim the whole block if over half of it is free */ |
1172 | if (pages >= (1 << (pageblock_order-1)) || | 1163 | if (pages >= (1 << (pageblock_order-1)) || |
1173 | page_group_by_mobility_disabled) { | 1164 | page_group_by_mobility_disabled) |
1174 | |||
1175 | set_pageblock_migratetype(page, start_type); | 1165 | set_pageblock_migratetype(page, start_type); |
1176 | return start_type; | ||
1177 | } | ||
1178 | |||
1179 | } | 1166 | } |
1180 | |||
1181 | return fallback_type; | ||
1182 | } | 1167 | } |
1183 | 1168 | ||
1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1169 | /* Remove an element from the buddy allocator from the fallback list */ |
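To make the thresholds in the rewritten try_to_steal_freepages() comment concrete, assuming 4 KB pages and pageblock_order = 9 (the common x86 value, not spelled out in the patch):

	pageblock size        = 1 << 9 pages   = 512 pages = 2 MB
	steal-extra threshold = order >= 9 / 2 = order 4 (64 KB buddy or larger,
	                        for movable fallbacks; unmovable and reclaimable
	                        fallbacks always steal)
	claim whole block     = 1 << (9 - 1)   = 256 pages moved, i.e. over half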
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1188 | struct free_area *area; | 1173 | struct free_area *area; |
1189 | unsigned int current_order; | 1174 | unsigned int current_order; |
1190 | struct page *page; | 1175 | struct page *page; |
1191 | int migratetype, new_type, i; | ||
1192 | 1176 | ||
1193 | /* Find the largest possible block of pages in the other list */ | 1177 | /* Find the largest possible block of pages in the other list */ |
1194 | for (current_order = MAX_ORDER-1; | 1178 | for (current_order = MAX_ORDER-1; |
1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1179 | current_order >= order && current_order <= MAX_ORDER-1; |
1196 | --current_order) { | 1180 | --current_order) { |
1181 | int i; | ||
1197 | for (i = 0;; i++) { | 1182 | for (i = 0;; i++) { |
1198 | migratetype = fallbacks[start_migratetype][i]; | 1183 | int migratetype = fallbacks[start_migratetype][i]; |
1184 | int buddy_type = start_migratetype; | ||
1199 | 1185 | ||
1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1186 | /* MIGRATE_RESERVE handled later if necessary */ |
1201 | if (migratetype == MIGRATE_RESERVE) | 1187 | if (migratetype == MIGRATE_RESERVE) |
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1209 | struct page, lru); | 1195 | struct page, lru); |
1210 | area->nr_free--; | 1196 | area->nr_free--; |
1211 | 1197 | ||
1212 | new_type = try_to_steal_freepages(zone, page, | 1198 | if (!is_migrate_cma(migratetype)) { |
1213 | start_migratetype, | 1199 | try_to_steal_freepages(zone, page, |
1214 | migratetype); | 1200 | start_migratetype, |
1201 | migratetype); | ||
1202 | } else { | ||
1203 | /* | ||
1204 | * When borrowing from MIGRATE_CMA, we need to | ||
1205 | * release the excess buddy pages to CMA | ||
1206 | * itself, and we do not try to steal extra | ||
1207 | * free pages. | ||
1208 | */ | ||
1209 | buddy_type = migratetype; | ||
1210 | } | ||
1215 | 1211 | ||
1216 | /* Remove the page from the freelists */ | 1212 | /* Remove the page from the freelists */ |
1217 | list_del(&page->lru); | 1213 | list_del(&page->lru); |
1218 | rmv_page_order(page); | 1214 | rmv_page_order(page); |
1219 | 1215 | ||
1220 | expand(zone, page, order, current_order, area, | 1216 | expand(zone, page, order, current_order, area, |
1221 | new_type); | 1217 | buddy_type); |
1222 | /* The freepage_migratetype may differ from pageblock's | 1218 | |
1219 | /* | ||
1220 | * The freepage_migratetype may differ from pageblock's | ||
1223 | * migratetype depending on the decisions in | 1221 | * migratetype depending on the decisions in |
1224 | * try_to_steal_freepages. This is OK as long as it does | 1222 | * try_to_steal_freepages(). This is OK as long as it |
1225 | * not differ for MIGRATE_CMA type. | 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
1224 | * we need to make sure unallocated pages flushed from | ||
1225 | * pcp lists are returned to the correct freelist. | ||
1226 | */ | 1226 | */ |
1227 | set_freepage_migratetype(page, new_type); | 1227 | set_freepage_migratetype(page, buddy_type); |
1228 | 1228 | ||
1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1230 | start_migratetype, migratetype, new_type); | 1230 | start_migratetype, migratetype); |
1231 | 1231 | ||
1232 | return page; | 1232 | return page; |
1233 | } | 1233 | } |
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) | |||
1642 | } | 1642 | } |
1643 | 1643 | ||
1644 | /* | 1644 | /* |
1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1645 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
1647 | * or two. | ||
1648 | */ | 1646 | */ |
1649 | static inline | 1647 | static inline |
1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1648 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1655 | struct page *page; | 1653 | struct page *page; |
1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1654 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1657 | 1655 | ||
1658 | again: | ||
1659 | if (likely(order == 0)) { | 1656 | if (likely(order == 0)) { |
1660 | struct per_cpu_pages *pcp; | 1657 | struct per_cpu_pages *pcp; |
1661 | struct list_head *list; | 1658 | struct list_head *list; |
@@ -1711,8 +1708,6 @@ again: | |||
1711 | local_irq_restore(flags); | 1708 | local_irq_restore(flags); |
1712 | 1709 | ||
1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1710 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
1714 | if (prep_new_page(page, order, gfp_flags)) | ||
1715 | goto again; | ||
1716 | return page; | 1711 | return page; |
1717 | 1712 | ||
1718 | failed: | 1713 | failed: |
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
2033 | * a page. | 2028 | * a page. |
2034 | */ | 2029 | */ |
2035 | static struct page * | 2030 | static struct page * |
2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2031 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2032 | const struct alloc_context *ac) |
2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
2039 | { | 2033 | { |
2034 | struct zonelist *zonelist = ac->zonelist; | ||
2040 | struct zoneref *z; | 2035 | struct zoneref *z; |
2041 | struct page *page = NULL; | 2036 | struct page *page = NULL; |
2042 | struct zone *zone; | 2037 | struct zone *zone; |
@@ -2055,8 +2050,8 @@ zonelist_scan: | |||
2055 | * Scan zonelist, looking for a zone with enough free. | 2050 | * Scan zonelist, looking for a zone with enough free. |
2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2051 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
2057 | */ | 2052 | */ |
2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2053 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
2059 | high_zoneidx, nodemask) { | 2054 | ac->nodemask) { |
2060 | unsigned long mark; | 2055 | unsigned long mark; |
2061 | 2056 | ||
2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2057 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
@@ -2073,7 +2068,7 @@ zonelist_scan: | |||
2073 | * time the page has in memory before being reclaimed. | 2068 | * time the page has in memory before being reclaimed. |
2074 | */ | 2069 | */ |
2075 | if (alloc_flags & ALLOC_FAIR) { | 2070 | if (alloc_flags & ALLOC_FAIR) { |
2076 | if (!zone_local(preferred_zone, zone)) | 2071 | if (!zone_local(ac->preferred_zone, zone)) |
2077 | break; | 2072 | break; |
2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2073 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
2079 | nr_fair_skipped++; | 2074 | nr_fair_skipped++; |
@@ -2111,7 +2106,7 @@ zonelist_scan: | |||
2111 | 2106 | ||
2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2107 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
2113 | if (!zone_watermark_ok(zone, order, mark, | 2108 | if (!zone_watermark_ok(zone, order, mark, |
2114 | classzone_idx, alloc_flags)) { | 2109 | ac->classzone_idx, alloc_flags)) { |
2115 | int ret; | 2110 | int ret; |
2116 | 2111 | ||
2117 | /* Checked here to keep the fast path fast */ | 2112 | /* Checked here to keep the fast path fast */ |
@@ -2132,7 +2127,7 @@ zonelist_scan: | |||
2132 | } | 2127 | } |
2133 | 2128 | ||
2134 | if (zone_reclaim_mode == 0 || | 2129 | if (zone_reclaim_mode == 0 || |
2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2130 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
2136 | goto this_zone_full; | 2131 | goto this_zone_full; |
2137 | 2132 | ||
2138 | /* | 2133 | /* |
@@ -2154,7 +2149,7 @@ zonelist_scan: | |||
2154 | default: | 2149 | default: |
2155 | /* did we reclaim enough */ | 2150 | /* did we reclaim enough */ |
2156 | if (zone_watermark_ok(zone, order, mark, | 2151 | if (zone_watermark_ok(zone, order, mark, |
2157 | classzone_idx, alloc_flags)) | 2152 | ac->classzone_idx, alloc_flags)) |
2158 | goto try_this_zone; | 2153 | goto try_this_zone; |
2159 | 2154 | ||
2160 | /* | 2155 | /* |
@@ -2175,27 +2170,18 @@ zonelist_scan: | |||
2175 | } | 2170 | } |
2176 | 2171 | ||
2177 | try_this_zone: | 2172 | try_this_zone: |
2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2173 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
2179 | gfp_mask, migratetype); | 2174 | gfp_mask, ac->migratetype); |
2180 | if (page) | 2175 | if (page) { |
2181 | break; | 2176 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
2177 | goto try_this_zone; | ||
2178 | return page; | ||
2179 | } | ||
2182 | this_zone_full: | 2180 | this_zone_full: |
2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2181 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2184 | zlc_mark_zone_full(zonelist, z); | 2182 | zlc_mark_zone_full(zonelist, z); |
2185 | } | 2183 | } |
2186 | 2184 | ||
2187 | if (page) { | ||
2188 | /* | ||
2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2190 | * necessary to allocate the page. The expectation is | ||
2191 | * that the caller is taking steps that will free more | ||
2192 | * memory. The caller should avoid the page being used | ||
2193 | * for !PFMEMALLOC purposes. | ||
2194 | */ | ||
2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
2196 | return page; | ||
2197 | } | ||
2198 | |||
2199 | /* | 2185 | /* |
2200 | * The first pass makes sure allocations are spread fairly within the | 2186 | * The first pass makes sure allocations are spread fairly within the |
2201 | * local node. However, the local node might have free pages left | 2187 | * local node. However, the local node might have free pages left |
@@ -2208,7 +2194,7 @@ this_zone_full: | |||
2208 | alloc_flags &= ~ALLOC_FAIR; | 2194 | alloc_flags &= ~ALLOC_FAIR; |
2209 | if (nr_fair_skipped) { | 2195 | if (nr_fair_skipped) { |
2210 | zonelist_rescan = true; | 2196 | zonelist_rescan = true; |
2211 | reset_alloc_batches(preferred_zone); | 2197 | reset_alloc_batches(ac->preferred_zone); |
2212 | } | 2198 | } |
2213 | if (nr_online_nodes > 1) | 2199 | if (nr_online_nodes > 1) |
2214 | zonelist_rescan = true; | 2200 | zonelist_rescan = true; |
@@ -2330,44 +2316,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
2330 | 2316 | ||
2331 | static inline struct page * | 2317 | static inline struct page * |
2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2318 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2319 | const struct alloc_context *ac, unsigned long *did_some_progress) |
2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2335 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2336 | { | 2320 | { |
2337 | struct page *page; | 2321 | struct page *page; |
2338 | 2322 | ||
2339 | *did_some_progress = 0; | 2323 | *did_some_progress = 0; |
2340 | 2324 | ||
2341 | if (oom_killer_disabled) | ||
2342 | return NULL; | ||
2343 | |||
2344 | /* | 2325 | /* |
2345 | * Acquire the per-zone oom lock for each zone. If that | 2326 | * Acquire the per-zone oom lock for each zone. If that |
2346 | * fails, somebody else is making progress for us. | 2327 | * fails, somebody else is making progress for us. |
2347 | */ | 2328 | */ |
2348 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | 2329 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
2349 | *did_some_progress = 1; | 2330 | *did_some_progress = 1; |
2350 | schedule_timeout_uninterruptible(1); | 2331 | schedule_timeout_uninterruptible(1); |
2351 | return NULL; | 2332 | return NULL; |
2352 | } | 2333 | } |
2353 | 2334 | ||
2354 | /* | 2335 | /* |
2355 | * PM-freezer should be notified that there might be an OOM killer on | ||
2356 | * its way to kill and wake somebody up. This is too early and we might | ||
2357 | * end up not killing anything but false positives are acceptable. | ||
2358 | * See freeze_processes. | ||
2359 | */ | ||
2360 | note_oom_kill(); | ||
2361 | |||
2362 | /* | ||
2363 | * Go through the zonelist yet one more time, keep very high watermark | 2336 | * Go through the zonelist yet one more time, keep very high watermark |
2364 | * here, this is only to catch a parallel oom killing, we must fail if | 2337 | * here, this is only to catch a parallel oom killing, we must fail if |
2365 | * we're still under heavy pressure. | 2338 | * we're still under heavy pressure. |
2366 | */ | 2339 | */ |
2367 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2340 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
2368 | order, zonelist, high_zoneidx, | 2341 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
2369 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
2370 | preferred_zone, classzone_idx, migratetype); | ||
2371 | if (page) | 2342 | if (page) |
2372 | goto out; | 2343 | goto out; |
2373 | 2344 | ||
@@ -2379,11 +2350,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2379 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2350 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2380 | goto out; | 2351 | goto out; |
2381 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2352 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2382 | if (high_zoneidx < ZONE_NORMAL) | 2353 | if (ac->high_zoneidx < ZONE_NORMAL) |
2383 | goto out; | 2354 | goto out; |
2384 | /* The OOM killer does not compensate for light reclaim */ | 2355 | /* The OOM killer does not compensate for light reclaim */ |
2385 | if (!(gfp_mask & __GFP_FS)) | 2356 | if (!(gfp_mask & __GFP_FS)) { |
2357 | /* | ||
2358 | * XXX: Page reclaim didn't yield anything, | ||
2359 | * and the OOM killer can't be invoked, but | ||
2360 | * keep looping as per should_alloc_retry(). | ||
2361 | */ | ||
2362 | *did_some_progress = 1; | ||
2386 | goto out; | 2363 | goto out; |
2364 | } | ||
2387 | /* | 2365 | /* |
2388 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2366 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
2389 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | 2367 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. |
@@ -2395,10 +2373,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2395 | goto out; | 2373 | goto out; |
2396 | } | 2374 | } |
2397 | /* Exhausted what can be done so it's blamo time */ | 2375 | /* Exhausted what can be done so it's blamo time */ |
2398 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2376 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
2399 | *did_some_progress = 1; | 2377 | *did_some_progress = 1; |
2400 | out: | 2378 | out: |
2401 | oom_zonelist_unlock(zonelist, gfp_mask); | 2379 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
2402 | return page; | 2380 | return page; |
2403 | } | 2381 | } |
2404 | 2382 | ||
@@ -2406,10 +2384,9 @@ out: | |||
2406 | /* Try memory compaction for high-order allocations before reclaim */ | 2384 | /* Try memory compaction for high-order allocations before reclaim */ |
2407 | static struct page * | 2385 | static struct page * |
2408 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2386 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2409 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2387 | int alloc_flags, const struct alloc_context *ac, |
2410 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2388 | enum migrate_mode mode, int *contended_compaction, |
2411 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2389 | bool *deferred_compaction) |
2412 | int *contended_compaction, bool *deferred_compaction) | ||
2413 | { | 2390 | { |
2414 | unsigned long compact_result; | 2391 | unsigned long compact_result; |
2415 | struct page *page; | 2392 | struct page *page; |
@@ -2418,10 +2395,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2418 | return NULL; | 2395 | return NULL; |
2419 | 2396 | ||
2420 | current->flags |= PF_MEMALLOC; | 2397 | current->flags |= PF_MEMALLOC; |
2421 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2398 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
2422 | nodemask, mode, | 2399 | mode, contended_compaction); |
2423 | contended_compaction, | ||
2424 | alloc_flags, classzone_idx); | ||
2425 | current->flags &= ~PF_MEMALLOC; | 2400 | current->flags &= ~PF_MEMALLOC; |
2426 | 2401 | ||
2427 | switch (compact_result) { | 2402 | switch (compact_result) { |
@@ -2440,10 +2415,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2440 | */ | 2415 | */ |
2441 | count_vm_event(COMPACTSTALL); | 2416 | count_vm_event(COMPACTSTALL); |
2442 | 2417 | ||
2443 | page = get_page_from_freelist(gfp_mask, nodemask, | 2418 | page = get_page_from_freelist(gfp_mask, order, |
2444 | order, zonelist, high_zoneidx, | 2419 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2445 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2446 | preferred_zone, classzone_idx, migratetype); | ||
2447 | 2420 | ||
2448 | if (page) { | 2421 | if (page) { |
2449 | struct zone *zone = page_zone(page); | 2422 | struct zone *zone = page_zone(page); |
@@ -2467,10 +2440,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2467 | #else | 2440 | #else |
2468 | static inline struct page * | 2441 | static inline struct page * |
2469 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2442 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2470 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2443 | int alloc_flags, const struct alloc_context *ac, |
2471 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2444 | enum migrate_mode mode, int *contended_compaction, |
2472 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2445 | bool *deferred_compaction) |
2473 | int *contended_compaction, bool *deferred_compaction) | ||
2474 | { | 2446 | { |
2475 | return NULL; | 2447 | return NULL; |
2476 | } | 2448 | } |
@@ -2478,8 +2450,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2478 | 2450 | ||
2479 | /* Perform direct synchronous page reclaim */ | 2451 | /* Perform direct synchronous page reclaim */ |
2480 | static int | 2452 | static int |
2481 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2453 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
2482 | nodemask_t *nodemask) | 2454 | const struct alloc_context *ac) |
2483 | { | 2455 | { |
2484 | struct reclaim_state reclaim_state; | 2456 | struct reclaim_state reclaim_state; |
2485 | int progress; | 2457 | int progress; |
@@ -2493,7 +2465,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2493 | reclaim_state.reclaimed_slab = 0; | 2465 | reclaim_state.reclaimed_slab = 0; |
2494 | current->reclaim_state = &reclaim_state; | 2466 | current->reclaim_state = &reclaim_state; |
2495 | 2467 | ||
2496 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2468 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
2469 | ac->nodemask); | ||
2497 | 2470 | ||
2498 | current->reclaim_state = NULL; | 2471 | current->reclaim_state = NULL; |
2499 | lockdep_clear_current_reclaim_state(); | 2472 | lockdep_clear_current_reclaim_state(); |
@@ -2507,28 +2480,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2507 | /* The really slow allocator path where we enter direct reclaim */ | 2480 | /* The really slow allocator path where we enter direct reclaim */ |
2508 | static inline struct page * | 2481 | static inline struct page * |
2509 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2482 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2510 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2483 | int alloc_flags, const struct alloc_context *ac, |
2511 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2484 | unsigned long *did_some_progress) |
2512 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2513 | { | 2485 | { |
2514 | struct page *page = NULL; | 2486 | struct page *page = NULL; |
2515 | bool drained = false; | 2487 | bool drained = false; |
2516 | 2488 | ||
2517 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2489 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
2518 | nodemask); | ||
2519 | if (unlikely(!(*did_some_progress))) | 2490 | if (unlikely(!(*did_some_progress))) |
2520 | return NULL; | 2491 | return NULL; |
2521 | 2492 | ||
2522 | /* After successful reclaim, reconsider all zones for allocation */ | 2493 | /* After successful reclaim, reconsider all zones for allocation */ |
2523 | if (IS_ENABLED(CONFIG_NUMA)) | 2494 | if (IS_ENABLED(CONFIG_NUMA)) |
2524 | zlc_clear_zones_full(zonelist); | 2495 | zlc_clear_zones_full(ac->zonelist); |
2525 | 2496 | ||
2526 | retry: | 2497 | retry: |
2527 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2498 | page = get_page_from_freelist(gfp_mask, order, |
2528 | zonelist, high_zoneidx, | 2499 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2529 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2530 | preferred_zone, classzone_idx, | ||
2531 | migratetype); | ||
2532 | 2500 | ||
2533 | /* | 2501 | /* |
2534 | * If an allocation failed after direct reclaim, it could be because | 2502 | * If an allocation failed after direct reclaim, it could be because |
@@ -2549,36 +2517,30 @@ retry: | |||
2549 | */ | 2517 | */ |
2550 | static inline struct page * | 2518 | static inline struct page * |
2551 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2519 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2552 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2520 | const struct alloc_context *ac) |
2553 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2554 | int classzone_idx, int migratetype) | ||
2555 | { | 2521 | { |
2556 | struct page *page; | 2522 | struct page *page; |
2557 | 2523 | ||
2558 | do { | 2524 | do { |
2559 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2525 | page = get_page_from_freelist(gfp_mask, order, |
2560 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2526 | ALLOC_NO_WATERMARKS, ac); |
2561 | preferred_zone, classzone_idx, migratetype); | ||
2562 | 2527 | ||
2563 | if (!page && gfp_mask & __GFP_NOFAIL) | 2528 | if (!page && gfp_mask & __GFP_NOFAIL) |
2564 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2529 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
2530 | HZ/50); | ||
2565 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2531 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2566 | 2532 | ||
2567 | return page; | 2533 | return page; |
2568 | } | 2534 | } |
2569 | 2535 | ||
2570 | static void wake_all_kswapds(unsigned int order, | 2536 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
2571 | struct zonelist *zonelist, | ||
2572 | enum zone_type high_zoneidx, | ||
2573 | struct zone *preferred_zone, | ||
2574 | nodemask_t *nodemask) | ||
2575 | { | 2537 | { |
2576 | struct zoneref *z; | 2538 | struct zoneref *z; |
2577 | struct zone *zone; | 2539 | struct zone *zone; |
2578 | 2540 | ||
2579 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2541 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
2580 | high_zoneidx, nodemask) | 2542 | ac->high_zoneidx, ac->nodemask) |
2581 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2543 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
2582 | } | 2544 | } |
2583 | 2545 | ||
2584 | static inline int | 2546 | static inline int |
@@ -2637,9 +2599,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
2637 | 2599 | ||
2638 | static inline struct page * | 2600 | static inline struct page * |
2639 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2601 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2640 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2602 | struct alloc_context *ac) |
2641 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2642 | int classzone_idx, int migratetype) | ||
2643 | { | 2603 | { |
2644 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2604 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2645 | struct page *page = NULL; | 2605 | struct page *page = NULL; |
@@ -2675,8 +2635,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2675 | 2635 | ||
2676 | retry: | 2636 | retry: |
2677 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2637 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2678 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2638 | wake_all_kswapds(order, ac); |
2679 | preferred_zone, nodemask); | ||
2680 | 2639 | ||
2681 | /* | 2640 | /* |
2682 | * OK, we're below the kswapd watermark and have kicked background | 2641 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2689,17 +2648,16 @@ retry: | |||
2689 | * Find the true preferred zone if the allocation is unconstrained by | 2648 | * Find the true preferred zone if the allocation is unconstrained by |
2690 | * cpusets. | 2649 | * cpusets. |
2691 | */ | 2650 | */ |
2692 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2651 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
2693 | struct zoneref *preferred_zoneref; | 2652 | struct zoneref *preferred_zoneref; |
2694 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2653 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
2695 | NULL, &preferred_zone); | 2654 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
2696 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2655 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2697 | } | 2656 | } |
2698 | 2657 | ||
2699 | /* This is the last chance, in general, before the goto nopage. */ | 2658 | /* This is the last chance, in general, before the goto nopage. */ |
2700 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2659 | page = get_page_from_freelist(gfp_mask, order, |
2701 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2660 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2702 | preferred_zone, classzone_idx, migratetype); | ||
2703 | if (page) | 2661 | if (page) |
2704 | goto got_pg; | 2662 | goto got_pg; |
2705 | 2663 | ||
@@ -2710,11 +2668,10 @@ retry: | |||
2710 | * the allocation is high priority and these type of | 2668 | * the allocation is high priority and these type of |
2711 | * allocations are system rather than user orientated | 2669 | * allocations are system rather than user orientated |
2712 | */ | 2670 | */ |
2713 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2671 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
2672 | |||
2673 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
2714 | 2674 | ||
2715 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
2716 | zonelist, high_zoneidx, nodemask, | ||
2717 | preferred_zone, classzone_idx, migratetype); | ||
2718 | if (page) { | 2675 | if (page) { |
2719 | goto got_pg; | 2676 | goto got_pg; |
2720 | } | 2677 | } |
@@ -2743,11 +2700,9 @@ retry: | |||
2743 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2700 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2744 | * attempts after direct reclaim are synchronous | 2701 | * attempts after direct reclaim are synchronous |
2745 | */ | 2702 | */ |
2746 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2703 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
2747 | high_zoneidx, nodemask, alloc_flags, | 2704 | migration_mode, |
2748 | preferred_zone, | 2705 | &contended_compaction, |
2749 | classzone_idx, migratetype, | ||
2750 | migration_mode, &contended_compaction, | ||
2751 | &deferred_compaction); | 2706 | &deferred_compaction); |
2752 | if (page) | 2707 | if (page) |
2753 | goto got_pg; | 2708 | goto got_pg; |
@@ -2793,12 +2748,8 @@ retry: | |||
2793 | migration_mode = MIGRATE_SYNC_LIGHT; | 2748 | migration_mode = MIGRATE_SYNC_LIGHT; |
2794 | 2749 | ||
2795 | /* Try direct reclaim and then allocating */ | 2750 | /* Try direct reclaim and then allocating */ |
2796 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2751 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
2797 | zonelist, high_zoneidx, | 2752 | &did_some_progress); |
2798 | nodemask, | ||
2799 | alloc_flags, preferred_zone, | ||
2800 | classzone_idx, migratetype, | ||
2801 | &did_some_progress); | ||
2802 | if (page) | 2753 | if (page) |
2803 | goto got_pg; | 2754 | goto got_pg; |
2804 | 2755 | ||
@@ -2812,17 +2763,15 @@ retry: | |||
2812 | * start OOM killing tasks. | 2763 | * start OOM killing tasks. |
2813 | */ | 2764 | */ |
2814 | if (!did_some_progress) { | 2765 | if (!did_some_progress) { |
2815 | page = __alloc_pages_may_oom(gfp_mask, order, zonelist, | 2766 | page = __alloc_pages_may_oom(gfp_mask, order, ac, |
2816 | high_zoneidx, nodemask, | 2767 | &did_some_progress); |
2817 | preferred_zone, classzone_idx, | ||
2818 | migratetype,&did_some_progress); | ||
2819 | if (page) | 2768 | if (page) |
2820 | goto got_pg; | 2769 | goto got_pg; |
2821 | if (!did_some_progress) | 2770 | if (!did_some_progress) |
2822 | goto nopage; | 2771 | goto nopage; |
2823 | } | 2772 | } |
2824 | /* Wait for some write requests to complete then retry */ | 2773 | /* Wait for some write requests to complete then retry */ |
2825 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2774 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
2826 | goto retry; | 2775 | goto retry; |
2827 | } else { | 2776 | } else { |
2828 | /* | 2777 | /* |
@@ -2830,11 +2779,9 @@ retry: | |||
2830 | * direct reclaim and reclaim/compaction depends on compaction | 2779 | * direct reclaim and reclaim/compaction depends on compaction |
2831 | * being called after reclaim so call directly if necessary | 2780 | * being called after reclaim so call directly if necessary |
2832 | */ | 2781 | */ |
2833 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2782 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2834 | high_zoneidx, nodemask, alloc_flags, | 2783 | alloc_flags, ac, migration_mode, |
2835 | preferred_zone, | 2784 | &contended_compaction, |
2836 | classzone_idx, migratetype, | ||
2837 | migration_mode, &contended_compaction, | ||
2838 | &deferred_compaction); | 2785 | &deferred_compaction); |
2839 | if (page) | 2786 | if (page) |
2840 | goto got_pg; | 2787 | goto got_pg; |
@@ -2842,11 +2789,7 @@ retry: | |||
2842 | 2789 | ||
2843 | nopage: | 2790 | nopage: |
2844 | warn_alloc_failed(gfp_mask, order, NULL); | 2791 | warn_alloc_failed(gfp_mask, order, NULL); |
2845 | return page; | ||
2846 | got_pg: | 2792 | got_pg: |
2847 | if (kmemcheck_enabled) | ||
2848 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2849 | |||
2850 | return page; | 2793 | return page; |
2851 | } | 2794 | } |
2852 | 2795 | ||
@@ -2857,14 +2800,16 @@ struct page * | |||
2857 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2800 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2858 | struct zonelist *zonelist, nodemask_t *nodemask) | 2801 | struct zonelist *zonelist, nodemask_t *nodemask) |
2859 | { | 2802 | { |
2860 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
2861 | struct zone *preferred_zone; | ||
2862 | struct zoneref *preferred_zoneref; | 2803 | struct zoneref *preferred_zoneref; |
2863 | struct page *page = NULL; | 2804 | struct page *page = NULL; |
2864 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
2865 | unsigned int cpuset_mems_cookie; | 2805 | unsigned int cpuset_mems_cookie; |
2866 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2806 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2867 | int classzone_idx; | 2807 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
2808 | struct alloc_context ac = { | ||
2809 | .high_zoneidx = gfp_zone(gfp_mask), | ||
2810 | .nodemask = nodemask, | ||
2811 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
2812 | }; | ||
2868 | 2813 | ||
2869 | gfp_mask &= gfp_allowed_mask; | 2814 | gfp_mask &= gfp_allowed_mask; |
2870 | 2815 | ||
@@ -2883,37 +2828,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2883 | if (unlikely(!zonelist->_zonerefs->zone)) | 2828 | if (unlikely(!zonelist->_zonerefs->zone)) |
2884 | return NULL; | 2829 | return NULL; |
2885 | 2830 | ||
2886 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2831 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
2887 | alloc_flags |= ALLOC_CMA; | 2832 | alloc_flags |= ALLOC_CMA; |
2888 | 2833 | ||
2889 | retry_cpuset: | 2834 | retry_cpuset: |
2890 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2835 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2891 | 2836 | ||
2837 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
2838 | ac.zonelist = zonelist; | ||
2892 | /* The preferred zone is used for statistics later */ | 2839 | /* The preferred zone is used for statistics later */ |
2893 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2840 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
2894 | nodemask ? : &cpuset_current_mems_allowed, | 2841 | ac.nodemask ? : &cpuset_current_mems_allowed, |
2895 | &preferred_zone); | 2842 | &ac.preferred_zone); |
2896 | if (!preferred_zone) | 2843 | if (!ac.preferred_zone) |
2897 | goto out; | 2844 | goto out; |
2898 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2845 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2899 | 2846 | ||
2900 | /* First allocation attempt */ | 2847 | /* First allocation attempt */ |
2901 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2848 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
2902 | zonelist, high_zoneidx, alloc_flags, | 2849 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
2903 | preferred_zone, classzone_idx, migratetype); | ||
2904 | if (unlikely(!page)) { | 2850 | if (unlikely(!page)) { |
2905 | /* | 2851 | /* |
2906 | * Runtime PM, block IO and its error handling path | 2852 | * Runtime PM, block IO and its error handling path |
2907 | * can deadlock because I/O on the device might not | 2853 | * can deadlock because I/O on the device might not |
2908 | * complete. | 2854 | * complete. |
2909 | */ | 2855 | */ |
2910 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2856 | alloc_mask = memalloc_noio_flags(gfp_mask); |
2911 | page = __alloc_pages_slowpath(gfp_mask, order, | 2857 | |
2912 | zonelist, high_zoneidx, nodemask, | 2858 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
2913 | preferred_zone, classzone_idx, migratetype); | ||
2914 | } | 2859 | } |
2915 | 2860 | ||
2916 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2861 | if (kmemcheck_enabled && page) |
2862 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2863 | |||
2864 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
2917 | 2865 | ||
2918 | out: | 2866 | out: |
2919 | /* | 2867 | /* |
@@ -3933,18 +3881,29 @@ static int __build_all_zonelists(void *data) | |||
3933 | return 0; | 3881 | return 0; |
3934 | } | 3882 | } |
3935 | 3883 | ||
3884 | static noinline void __init | ||
3885 | build_all_zonelists_init(void) | ||
3886 | { | ||
3887 | __build_all_zonelists(NULL); | ||
3888 | mminit_verify_zonelist(); | ||
3889 | cpuset_init_current_mems_allowed(); | ||
3890 | } | ||
3891 | |||
3936 | /* | 3892 | /* |
3937 | * Called with zonelists_mutex held always | 3893 | * Called with zonelists_mutex held always |
3938 | * unless system_state == SYSTEM_BOOTING. | 3894 | * unless system_state == SYSTEM_BOOTING. |
3895 | * | ||
3896 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | ||
3897 | * [we're only called with non-NULL zone through __meminit paths] and | ||
3898 | * (2) call of __init annotated helper build_all_zonelists_init | ||
3899 | * [protected by SYSTEM_BOOTING]. | ||
3939 | */ | 3900 | */ |
3940 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3901 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3941 | { | 3902 | { |
3942 | set_zonelist_order(); | 3903 | set_zonelist_order(); |
3943 | 3904 | ||
3944 | if (system_state == SYSTEM_BOOTING) { | 3905 | if (system_state == SYSTEM_BOOTING) { |
3945 | __build_all_zonelists(NULL); | 3906 | build_all_zonelists_init(); |
3946 | mminit_verify_zonelist(); | ||
3947 | cpuset_init_current_mems_allowed(); | ||
3948 | } else { | 3907 | } else { |
3949 | #ifdef CONFIG_MEMORY_HOTPLUG | 3908 | #ifdef CONFIG_MEMORY_HOTPLUG |
3950 | if (zone) | 3909 | if (zone) |
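For readers unfamiliar with the section annotations the new comment refers to: __init code is placed in a section that is freed once boot completes, so modpost warns about references to it from ordinary text unless the caller is marked __ref, which records that the reference is known to be safe. A generic illustration of the pattern (not taken from this file):

	#include <linux/init.h>

	static void __init boot_only_setup(void)
	{
		/* runs once during boot; its memory is released afterwards */
	}

	void __ref safe_caller(bool booting)
	{
		if (booting)		/* guarded, so this is never reached
					 * after the __init section is freed */
			boot_only_setup();
	}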
@@ -5047,8 +5006,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5047 | pgdat->node_start_pfn = node_start_pfn; | 5006 | pgdat->node_start_pfn = node_start_pfn; |
5048 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5007 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
5049 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5008 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
5050 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 5009 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5051 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 5010 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
5052 | #endif | 5011 | #endif |
5053 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5012 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5054 | zones_size, zholes_size); | 5013 | zones_size, zholes_size); |
@@ -5420,9 +5379,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5420 | arch_zone_highest_possible_pfn[i]) | 5379 | arch_zone_highest_possible_pfn[i]) |
5421 | pr_cont("empty\n"); | 5380 | pr_cont("empty\n"); |
5422 | else | 5381 | else |
5423 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5382 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
5424 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5383 | (u64)arch_zone_lowest_possible_pfn[i] |
5425 | (arch_zone_highest_possible_pfn[i] | 5384 | << PAGE_SHIFT, |
5385 | ((u64)arch_zone_highest_possible_pfn[i] | ||
5426 | << PAGE_SHIFT) - 1); | 5386 | << PAGE_SHIFT) - 1); |
5427 | } | 5387 | } |
5428 | 5388 | ||
@@ -5430,15 +5390,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5430 | pr_info("Movable zone start for each node\n"); | 5390 | pr_info("Movable zone start for each node\n"); |
5431 | for (i = 0; i < MAX_NUMNODES; i++) { | 5391 | for (i = 0; i < MAX_NUMNODES; i++) { |
5432 | if (zone_movable_pfn[i]) | 5392 | if (zone_movable_pfn[i]) |
5433 | pr_info(" Node %d: %#010lx\n", i, | 5393 | pr_info(" Node %d: %#018Lx\n", i, |
5434 | zone_movable_pfn[i] << PAGE_SHIFT); | 5394 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
5435 | } | 5395 | } |
5436 | 5396 | ||
5437 | /* Print out the early node map */ | 5397 | /* Print out the early node map */ |
5438 | pr_info("Early memory node ranges\n"); | 5398 | pr_info("Early memory node ranges\n"); |
5439 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5399 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5440 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5400 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
5441 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5401 | (u64)start_pfn << PAGE_SHIFT, |
5402 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
5442 | 5403 | ||
5443 | /* Initialise every node */ | 5404 | /* Initialise every node */ |
5444 | mminit_verify_pageflags_layout(); | 5405 | mminit_verify_pageflags_layout(); |
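The %#018Lx / (u64) changes in the last few hunks guard the shift itself, not just the printed width: a PFN is an unsigned long, so on 32-bit kernels pfn << PAGE_SHIFT is evaluated in 32 bits and truncates for memory above 4 GB (e.g. PAE systems). A small illustration, assuming 4 KB pages:

	unsigned long pfn = 0x100000;		/* first PFN at the 4 GB boundary */

	u64 wrong = pfn << PAGE_SHIFT;		/* shift done in 32 bits: result is 0 */
	u64 right = (u64)pfn << PAGE_SHIFT;	/* shift done in 64 bits: 0x100000000 */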