Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  533
1 file changed, 417 insertions, 116 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
223 | #endif | 223 | #endif |
224 | }; | 224 | }; |
225 | 225 | ||
226 | char * const migratetype_names[MIGRATE_TYPES] = { | ||
227 | "Unmovable", | ||
228 | "Movable", | ||
229 | "Reclaimable", | ||
230 | "HighAtomic", | ||
231 | #ifdef CONFIG_CMA | ||
232 | "CMA", | ||
233 | #endif | ||
234 | #ifdef CONFIG_MEMORY_ISOLATION | ||
235 | "Isolate", | ||
236 | #endif | ||
237 | }; | ||
238 | |||
226 | compound_page_dtor * const compound_page_dtors[] = { | 239 | compound_page_dtor * const compound_page_dtors[] = { |
227 | NULL, | 240 | NULL, |
228 | free_compound_page, | 241 | free_compound_page, |
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { | |||
236 | 249 | ||
237 | int min_free_kbytes = 1024; | 250 | int min_free_kbytes = 1024; |
238 | int user_min_free_kbytes = -1; | 251 | int user_min_free_kbytes = -1; |
252 | int watermark_scale_factor = 10; | ||
239 | 253 | ||
240 | static unsigned long __meminitdata nr_kernel_pages; | 254 | static unsigned long __meminitdata nr_kernel_pages; |
241 | static unsigned long __meminitdata nr_all_pages; | 255 | static unsigned long __meminitdata nr_all_pages; |
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | |||
247 | static unsigned long __initdata required_kernelcore; | 261 | static unsigned long __initdata required_kernelcore; |
248 | static unsigned long __initdata required_movablecore; | 262 | static unsigned long __initdata required_movablecore; |
249 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 263 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
264 | static bool mirrored_kernelcore; | ||
250 | 265 | ||
251 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 266 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
252 | int movable_zone; | 267 | int movable_zone; |
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, | |||
293 | unsigned long pfn, unsigned long zone_end, | 308 | unsigned long pfn, unsigned long zone_end, |
294 | unsigned long *nr_initialised) | 309 | unsigned long *nr_initialised) |
295 | { | 310 | { |
311 | unsigned long max_initialise; | ||
312 | |||
296 | /* Always populate low zones for address-constrained allocations */ | 313 |
297 | if (zone_end < pgdat_end_pfn(pgdat)) | 314 | if (zone_end < pgdat_end_pfn(pgdat)) |
298 | return true; | 315 | return true; |
316 | /* | ||
317 | * Initialise at least 2G of a node but also take into account that | ||
318 | * the two large system hashes can take up 1GB for 0.25TB/node. | ||
319 | */ | ||
320 | max_initialise = max(2UL << (30 - PAGE_SHIFT), | ||
321 | (pgdat->node_spanned_pages >> 8)); | ||
299 | 322 | ||
300 | /* Initialise at least 2G of the highest zone */ | ||
301 | (*nr_initialised)++; | 323 | (*nr_initialised)++; |
302 | if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && | 324 | if ((*nr_initialised > max_initialise) && |
303 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | 325 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
304 | pgdat->first_deferred_pfn = pfn; | 326 | pgdat->first_deferred_pfn = pfn; |
305 | return false; | 327 | return false; |
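The new cutoff defers struct page initialisation once a node has covered max(2G, node_spanned_pages / 256) worth of pages and the current pfn sits on a section boundary. A minimal userspace model of that arithmetic follows; PAGE_SHIFT of 12 and 32768 pages per section are assumptions for a 4K-page, 128M-section configuration, not values taken from this patch.

#include <stdio.h>

/* Userspace model of the deferred-init cutoff (assumes 4K pages and
 * 128M sections, i.e. PAGES_PER_SECTION == 32768). */
#define PAGE_SHIFT         12
#define PAGES_PER_SECTION  (1UL << 15)

static int defer_init(unsigned long spanned_pages, unsigned long nr_initialised,
                      unsigned long pfn)
{
        /* At least 2G per node, plus 1/256th of the node to cover the two
         * large system hashes (roughly 1GB per 0.25TB of node memory). */
        unsigned long max_initialise = 2UL << (30 - PAGE_SHIFT);

        if (spanned_pages >> 8 > max_initialise)
                max_initialise = spanned_pages >> 8;

        /* Defer only once the threshold is crossed on a section boundary. */
        return nr_initialised > max_initialise &&
               (pfn & (PAGES_PER_SECTION - 1)) == 0;
}

int main(void)
{
        /* 1TB node: the threshold grows to spanned/256, i.e. 4G of pages. */
        unsigned long spanned = 1UL << (40 - PAGE_SHIFT);

        printf("defer at pfn 0x%lx: %d\n", spanned >> 8,
               defer_init(spanned, (spanned >> 8) + 1, spanned >> 8));
        return 0;
}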
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason, | |||
416 | goto out; | 438 | goto out; |
417 | } | 439 | } |
418 | if (nr_unshown) { | 440 | if (nr_unshown) { |
419 | printk(KERN_ALERT | 441 | pr_alert( |
420 | "BUG: Bad page state: %lu messages suppressed\n", | 442 | "BUG: Bad page state: %lu messages suppressed\n", |
421 | nr_unshown); | 443 | nr_unshown); |
422 | nr_unshown = 0; | 444 | nr_unshown = 0; |
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason, | |||
426 | if (nr_shown++ == 0) | 448 | if (nr_shown++ == 0) |
427 | resume = jiffies + 60 * HZ; | 449 | resume = jiffies + 60 * HZ; |
428 | 450 | ||
429 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 451 | pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
430 | current->comm, page_to_pfn(page)); | 452 | current->comm, page_to_pfn(page)); |
431 | dump_page_badflags(page, reason, bad_flags); | 453 | __dump_page(page, reason); |
454 | bad_flags &= page->flags; | ||
455 | if (bad_flags) | ||
456 | pr_alert("bad because of flags: %#lx(%pGp)\n", | ||
457 | bad_flags, &bad_flags); | ||
458 | dump_page_owner(page); | ||
432 | 459 | ||
433 | print_modules(); | 460 | print_modules(); |
434 | dump_stack(); | 461 | dump_stack(); |
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order) | |||
477 | 504 | ||
478 | #ifdef CONFIG_DEBUG_PAGEALLOC | 505 | #ifdef CONFIG_DEBUG_PAGEALLOC |
479 | unsigned int _debug_guardpage_minorder; | 506 | unsigned int _debug_guardpage_minorder; |
480 | bool _debug_pagealloc_enabled __read_mostly; | 507 | bool _debug_pagealloc_enabled __read_mostly |
508 | = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); | ||
509 | EXPORT_SYMBOL(_debug_pagealloc_enabled); | ||
481 | bool _debug_guardpage_enabled __read_mostly; | 510 | bool _debug_guardpage_enabled __read_mostly; |
482 | 511 | ||
483 | static int __init early_debug_pagealloc(char *buf) | 512 | static int __init early_debug_pagealloc(char *buf) |
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf) | |||
488 | if (strcmp(buf, "on") == 0) | 517 | if (strcmp(buf, "on") == 0) |
489 | _debug_pagealloc_enabled = true; | 518 | _debug_pagealloc_enabled = true; |
490 | 519 | ||
520 | if (strcmp(buf, "off") == 0) | ||
521 | _debug_pagealloc_enabled = false; | ||
522 | |||
491 | return 0; | 523 | return 0; |
492 | } | 524 | } |
493 | early_param("debug_pagealloc", early_debug_pagealloc); | 525 | early_param("debug_pagealloc", early_debug_pagealloc); |
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
519 | unsigned long res; | 551 | unsigned long res; |
520 | 552 | ||
521 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | 553 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
522 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | 554 | pr_err("Bad debug_guardpage_minorder value\n"); |
523 | return 0; | 555 | return 0; |
524 | } | 556 | } |
525 | _debug_guardpage_minorder = res; | 557 | _debug_guardpage_minorder = res; |
526 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | 558 | pr_info("Setting debug_guardpage_minorder to %lu\n", res); |
527 | return 0; | 559 | return 0; |
528 | } | 560 | } |
529 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 561 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
@@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page, | |||
660 | unsigned long combined_idx; | 692 | unsigned long combined_idx; |
661 | unsigned long uninitialized_var(buddy_idx); | 693 | unsigned long uninitialized_var(buddy_idx); |
662 | struct page *buddy; | 694 | struct page *buddy; |
663 | unsigned int max_order = MAX_ORDER; | 695 | unsigned int max_order; |
696 | |||
697 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
664 | 698 | ||
665 | VM_BUG_ON(!zone_is_initialized(zone)); | 699 | VM_BUG_ON(!zone_is_initialized(zone)); |
666 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | 700 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
667 | 701 | ||
668 | VM_BUG_ON(migratetype == -1); | 702 | VM_BUG_ON(migratetype == -1); |
669 | if (is_migrate_isolate(migratetype)) { | 703 | if (likely(!is_migrate_isolate(migratetype))) |
670 | /* | ||
671 | * We restrict max order of merging to prevent merge | ||
672 | * between freepages on isolate pageblock and normal | ||
673 | * pageblock. Without this, pageblock isolation | ||
674 | * could cause incorrect freepage accounting. | ||
675 | */ | ||
676 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
677 | } else { | ||
678 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 704 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
679 | } | ||
680 | 705 | ||
681 | page_idx = pfn & ((1 << max_order) - 1); | 706 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
682 | 707 | ||
683 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 708 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
684 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 709 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
685 | 710 | ||
711 | continue_merging: | ||
686 | while (order < max_order - 1) { | 712 | while (order < max_order - 1) { |
687 | buddy_idx = __find_buddy_index(page_idx, order); | 713 | buddy_idx = __find_buddy_index(page_idx, order); |
688 | buddy = page + (buddy_idx - page_idx); | 714 | buddy = page + (buddy_idx - page_idx); |
689 | if (!page_is_buddy(page, buddy, order)) | 715 | if (!page_is_buddy(page, buddy, order)) |
690 | break; | 716 | goto done_merging; |
691 | /* | 717 | /* |
692 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, | 718 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
693 | * merge with it and move up one order. | 719 | * merge with it and move up one order. |
@@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page, | |||
704 | page_idx = combined_idx; | 730 | page_idx = combined_idx; |
705 | order++; | 731 | order++; |
706 | } | 732 | } |
733 | if (max_order < MAX_ORDER) { | ||
734 | /* If we are here, it means order is >= pageblock_order. | ||
735 | * We want to prevent merge between freepages on isolate | ||
736 | * pageblock and normal pageblock. Without this, pageblock | ||
737 | * isolation could cause incorrect freepage or CMA accounting. | ||
738 | * | ||
739 | * We don't want to hit this code for the more frequent | ||
740 | * low-order merging. | ||
741 | */ | ||
742 | if (unlikely(has_isolate_pageblock(zone))) { | ||
743 | int buddy_mt; | ||
744 | |||
745 | buddy_idx = __find_buddy_index(page_idx, order); | ||
746 | buddy = page + (buddy_idx - page_idx); | ||
747 | buddy_mt = get_pageblock_migratetype(buddy); | ||
748 | |||
749 | if (migratetype != buddy_mt | ||
750 | && (is_migrate_isolate(migratetype) || | ||
751 | is_migrate_isolate(buddy_mt))) | ||
752 | goto done_merging; | ||
753 | } | ||
754 | max_order++; | ||
755 | goto continue_merging; | ||
756 | } | ||
757 | |||
758 | done_merging: | ||
707 | set_page_order(page, order); | 759 | set_page_order(page, order); |
708 | 760 | ||
709 | /* | 761 | /* |
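The reworked __free_one_page() now merges freely up to pageblock_order and only then, if isolation is active anywhere in the zone, compares pageblock migratetypes before merging across a pageblock boundary. The index arithmetic the loop relies on is sketched below; __find_buddy_index() is the XOR shown here in this kernel, while the surrounding program is purely an illustrative model.

#include <stdio.h>

/* Minimal model of the buddy-index arithmetic used by __free_one_page().
 * In this kernel __find_buddy_index(idx, order) is idx ^ (1 << order);
 * the merged (combined) index is the lower of the two buddies. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long page_idx = 12;    /* order-2 block at index 12 */
        unsigned int order = 2;

        unsigned long buddy_idx = find_buddy_index(page_idx, order);
        unsigned long combined_idx = buddy_idx & page_idx;

        /* 12 ^ 4 = 8, so the buddy is the block at 8 and the merged
         * order-3 block starts at index 8. */
        printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
        return 0;
}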
@@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page) | |||
741 | bad_reason = "nonzero mapcount"; | 793 | bad_reason = "nonzero mapcount"; |
742 | if (unlikely(page->mapping != NULL)) | 794 | if (unlikely(page->mapping != NULL)) |
743 | bad_reason = "non-NULL mapping"; | 795 | bad_reason = "non-NULL mapping"; |
744 | if (unlikely(atomic_read(&page->_count) != 0)) | 796 | if (unlikely(page_ref_count(page) != 0)) |
745 | bad_reason = "nonzero _count"; | 797 | bad_reason = "nonzero _count"; |
746 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { | 798 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { |
747 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | 799 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
@@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
1002 | PAGE_SIZE << order); | 1054 | PAGE_SIZE << order); |
1003 | } | 1055 | } |
1004 | arch_free_page(page, order); | 1056 | arch_free_page(page, order); |
1057 | kernel_poison_pages(page, 1 << order, 0); | ||
1005 | kernel_map_pages(page, 1 << order, 0); | 1058 | kernel_map_pages(page, 1 << order, 0); |
1006 | 1059 | ||
1007 | return true; | 1060 | return true; |
@@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, | |||
1104 | return __free_pages_boot_core(page, pfn, order); | 1157 | return __free_pages_boot_core(page, pfn, order); |
1105 | } | 1158 | } |
1106 | 1159 | ||
1160 | /* | ||
1161 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
1162 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
1163 | * with the migration or free compaction scanner. The scanners then need to | ||
1164 | * use only pfn_valid_within() check for arches that allow holes within | ||
1165 | * pageblocks. | ||
1166 | * | ||
1167 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
1168 | * | ||
1169 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
1170 | * i.e. it's possible that all pages within a zone's range of pages do not | ||
1171 | * belong to a single zone. We assume that a border between node0 and node1 | ||
1172 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
1173 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
1174 | * the first and last page of a pageblock and avoid checking each individual | ||
1175 | * page in a pageblock. | ||
1176 | */ | ||
1177 | struct page *__pageblock_pfn_to_page(unsigned long start_pfn, | ||
1178 | unsigned long end_pfn, struct zone *zone) | ||
1179 | { | ||
1180 | struct page *start_page; | ||
1181 | struct page *end_page; | ||
1182 | |||
1183 | /* end_pfn is one past the range we are checking */ | ||
1184 | end_pfn--; | ||
1185 | |||
1186 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
1187 | return NULL; | ||
1188 | |||
1189 | start_page = pfn_to_page(start_pfn); | ||
1190 | |||
1191 | if (page_zone(start_page) != zone) | ||
1192 | return NULL; | ||
1193 | |||
1194 | end_page = pfn_to_page(end_pfn); | ||
1195 | |||
1196 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
1197 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
1198 | return NULL; | ||
1199 | |||
1200 | return start_page; | ||
1201 | } | ||
1202 | |||
1203 | void set_zone_contiguous(struct zone *zone) | ||
1204 | { | ||
1205 | unsigned long block_start_pfn = zone->zone_start_pfn; | ||
1206 | unsigned long block_end_pfn; | ||
1207 | |||
1208 | block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); | ||
1209 | for (; block_start_pfn < zone_end_pfn(zone); | ||
1210 | block_start_pfn = block_end_pfn, | ||
1211 | block_end_pfn += pageblock_nr_pages) { | ||
1212 | |||
1213 | block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); | ||
1214 | |||
1215 | if (!__pageblock_pfn_to_page(block_start_pfn, | ||
1216 | block_end_pfn, zone)) | ||
1217 | return; | ||
1218 | } | ||
1219 | |||
1220 | /* We confirm that there is no hole */ | ||
1221 | zone->contiguous = true; | ||
1222 | } | ||
1223 | |||
1224 | void clear_zone_contiguous(struct zone *zone) | ||
1225 | { | ||
1226 | zone->contiguous = false; | ||
1227 | } | ||
1228 | |||
1107 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 1229 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
1108 | static void __init deferred_free_range(struct page *page, | 1230 | static void __init deferred_free_range(struct page *page, |
1109 | unsigned long pfn, int nr_pages) | 1231 | unsigned long pfn, int nr_pages) |
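set_zone_contiguous() walks the zone one pageblock at a time, clamping the first and last block to the zone boundaries, and marks the zone contiguous only if every block passes __pageblock_pfn_to_page(). A rough userspace model of that walk follows; a pageblock size of 512 pages (2M pageblocks with 4K pages) is an assumption.

#include <stdio.h>

/* Userspace model of the pageblock walk in set_zone_contiguous().
 * PAGEBLOCK_NR_PAGES == 512 is an assumption (2M pageblocks, 4K pages). */
#define PAGEBLOCK_NR_PAGES 512UL
#define ALIGN_UP(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long zone_start_pfn = 0x100, zone_end_pfn = 0x1000;
        unsigned long block_start = zone_start_pfn;
        unsigned long block_end = ALIGN_UP(block_start + 1, PAGEBLOCK_NR_PAGES);

        for (; block_start < zone_end_pfn;
             block_start = block_end, block_end += PAGEBLOCK_NR_PAGES) {
                if (block_end > zone_end_pfn)
                        block_end = zone_end_pfn;
                /* The kernel calls __pageblock_pfn_to_page() on each range and
                 * bails out on the first block whose ends straddle a zone or
                 * fall into a hole; only then is zone->contiguous set. */
                printf("check [0x%lx, 0x%lx)\n", block_start, block_end);
        }
        return 0;
}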
@@ -1254,9 +1376,13 @@ free_range: | |||
1254 | pgdat_init_report_one_done(); | 1376 | pgdat_init_report_one_done(); |
1255 | return 0; | 1377 | return 0; |
1256 | } | 1378 | } |
1379 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1257 | 1380 | ||
1258 | void __init page_alloc_init_late(void) | 1381 | void __init page_alloc_init_late(void) |
1259 | { | 1382 | { |
1383 | struct zone *zone; | ||
1384 | |||
1385 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | ||
1260 | int nid; | 1386 | int nid; |
1261 | 1387 | ||
1262 | /* There will be num_node_state(N_MEMORY) threads */ | 1388 | /* There will be num_node_state(N_MEMORY) threads */ |
@@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void) | |||
1270 | 1396 | ||
1271 | /* Reinit limits that are based on free pages after the kernel is up */ | 1397 | /* Reinit limits that are based on free pages after the kernel is up */ |
1272 | files_maxfiles_init(); | 1398 | files_maxfiles_init(); |
1399 | #endif | ||
1400 | |||
1401 | for_each_populated_zone(zone) | ||
1402 | set_zone_contiguous(zone); | ||
1273 | } | 1403 | } |
1274 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1275 | 1404 | ||
1276 | #ifdef CONFIG_CMA | 1405 | #ifdef CONFIG_CMA |
1277 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ | 1406 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
@@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page) | |||
1360 | bad_reason = "nonzero mapcount"; | 1489 | bad_reason = "nonzero mapcount"; |
1361 | if (unlikely(page->mapping != NULL)) | 1490 | if (unlikely(page->mapping != NULL)) |
1362 | bad_reason = "non-NULL mapping"; | 1491 | bad_reason = "non-NULL mapping"; |
1363 | if (unlikely(atomic_read(&page->_count) != 0)) | 1492 | if (unlikely(page_ref_count(page) != 0)) |
1364 | bad_reason = "nonzero _count"; | 1493 | bad_reason = "nonzero _count"; |
1365 | if (unlikely(page->flags & __PG_HWPOISON)) { | 1494 | if (unlikely(page->flags & __PG_HWPOISON)) { |
1366 | bad_reason = "HWPoisoned (hardware-corrupted)"; | 1495 | bad_reason = "HWPoisoned (hardware-corrupted)"; |
@@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page) | |||
1381 | return 0; | 1510 | return 0; |
1382 | } | 1511 | } |
1383 | 1512 | ||
1513 | static inline bool free_pages_prezeroed(bool poisoned) | ||
1514 | { | ||
1515 | return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && | ||
1516 | page_poisoning_enabled() && poisoned; | ||
1517 | } | ||
1518 | |||
1384 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | 1519 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
1385 | int alloc_flags) | 1520 | int alloc_flags) |
1386 | { | 1521 | { |
1387 | int i; | 1522 | int i; |
1523 | bool poisoned = true; | ||
1388 | 1524 | ||
1389 | for (i = 0; i < (1 << order); i++) { | 1525 | for (i = 0; i < (1 << order); i++) { |
1390 | struct page *p = page + i; | 1526 | struct page *p = page + i; |
1391 | if (unlikely(check_new_page(p))) | 1527 | if (unlikely(check_new_page(p))) |
1392 | return 1; | 1528 | return 1; |
1529 | if (poisoned) | ||
1530 | poisoned &= page_is_poisoned(p); | ||
1393 | } | 1531 | } |
1394 | 1532 | ||
1395 | set_page_private(page, 0); | 1533 | set_page_private(page, 0); |
@@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | |||
1397 | 1535 | ||
1398 | arch_alloc_page(page, order); | 1536 | arch_alloc_page(page, order); |
1399 | kernel_map_pages(page, 1 << order, 1); | 1537 | kernel_map_pages(page, 1 << order, 1); |
1538 | kernel_poison_pages(page, 1 << order, 1); | ||
1400 | kasan_alloc_pages(page, order); | 1539 | kasan_alloc_pages(page, order); |
1401 | 1540 | ||
1402 | if (gfp_flags & __GFP_ZERO) | 1541 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) |
1403 | for (i = 0; i < (1 << order); i++) | 1542 | for (i = 0; i < (1 << order); i++) |
1404 | clear_highpage(page + i); | 1543 | clear_highpage(page + i); |
1405 | 1544 | ||
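With CONFIG_PAGE_POISONING_ZERO, pages are poisoned with zeroes on free, so prep_new_page() can skip clear_highpage() for a __GFP_ZERO allocation whose pages are all still poisoned. A small model of that decision, with the three conditions reduced to plain booleans:

#include <stdbool.h>
#include <stdio.h>

/* Model of the prep_new_page() zeroing decision. The three inputs stand in
 * for CONFIG_PAGE_POISONING_ZERO, page_poisoning_enabled() and the AND of
 * page_is_poisoned() over every page in the block. */
static bool free_pages_prezeroed(bool zero_poison_configured,
                                 bool poisoning_enabled, bool all_poisoned)
{
        return zero_poison_configured && poisoning_enabled && all_poisoned;
}

static bool must_clear(bool gfp_zero, bool prezeroed)
{
        /* clear_highpage() runs only when the caller asked for zeroed
         * memory and the pages are not already known to be zero. */
        return gfp_zero && !prezeroed;
}

int main(void)
{
        printf("zero-poisoned block, __GFP_ZERO: clear? %d\n",
               must_clear(true, free_pages_prezeroed(true, true, true)));
        printf("plain block, __GFP_ZERO: clear? %d\n",
               must_clear(true, free_pages_prezeroed(true, true, false)));
        return 0;
}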
@@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
2238 | list_del(&page->lru); | 2377 | list_del(&page->lru); |
2239 | pcp->count--; | 2378 | pcp->count--; |
2240 | } else { | 2379 | } else { |
2241 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | 2380 | /* |
2242 | /* | 2381 | * We most definitely don't want callers attempting to |
2243 | * __GFP_NOFAIL is not to be used in new code. | 2382 | * allocate greater than order-1 page units with __GFP_NOFAIL. |
2244 | * | 2383 | */ |
2245 | * All __GFP_NOFAIL callers should be fixed so that they | 2384 | WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); |
2246 | * properly detect and handle allocation failures. | ||
2247 | * | ||
2248 | * We most definitely don't want callers attempting to | ||
2249 | * allocate greater than order-1 page units with | ||
2250 | * __GFP_NOFAIL. | ||
2251 | */ | ||
2252 | WARN_ON_ONCE(order > 1); | ||
2253 | } | ||
2254 | spin_lock_irqsave(&zone->lock, flags); | 2385 | spin_lock_irqsave(&zone->lock, flags); |
2255 | 2386 | ||
2256 | page = NULL; | 2387 | page = NULL; |
@@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) | |||
2690 | va_end(args); | 2821 | va_end(args); |
2691 | } | 2822 | } |
2692 | 2823 | ||
2693 | pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", | 2824 | pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", |
2694 | current->comm, order, gfp_mask); | 2825 | current->comm, order, gfp_mask, &gfp_mask); |
2695 | |||
2696 | dump_stack(); | 2826 | dump_stack(); |
2697 | if (!should_suppress_show_mem()) | 2827 | if (!should_suppress_show_mem()) |
2698 | show_mem(filter); | 2828 | show_mem(filter); |
@@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2748 | * XXX: Page reclaim didn't yield anything, | 2878 | * XXX: Page reclaim didn't yield anything, |
2749 | * and the OOM killer can't be invoked, but | 2879 | * and the OOM killer can't be invoked, but |
2750 | * keep looping as per tradition. | 2880 | * keep looping as per tradition. |
2881 | * | ||
2882 | * But do not keep looping if oom_killer_disable() | ||
2883 | * was already called, for the system is trying to | ||
2884 | * enter a quiescent state during suspend. | ||
2751 | */ | 2885 | */ |
2752 | *did_some_progress = 1; | 2886 | *did_some_progress = !oom_killer_disabled; |
2753 | goto out; | 2887 | goto out; |
2754 | } | 2888 | } |
2755 | if (pm_suspended_storage()) | 2889 | if (pm_suspended_storage()) |
@@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3008 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) | 3142 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) |
3009 | gfp_mask &= ~__GFP_ATOMIC; | 3143 | gfp_mask &= ~__GFP_ATOMIC; |
3010 | 3144 | ||
3011 | /* | ||
3012 | * If this allocation cannot block and it is for a specific node, then | ||
3013 | * fail early. There's no need to wakeup kswapd or retry for a | ||
3014 | * speculative node-specific allocation. | ||
3015 | */ | ||
3016 | if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) | ||
3017 | goto nopage; | ||
3018 | |||
3019 | retry: | 3145 | retry: |
3020 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 3146 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
3021 | wake_all_kswapds(order, ac); | 3147 | wake_all_kswapds(order, ac); |
@@ -3372,7 +3498,7 @@ refill: | |||
3372 | /* Even if we own the page, we do not use atomic_set(). | 3498 | /* Even if we own the page, we do not use atomic_set(). |
3373 | * This would break get_page_unless_zero() users. | 3499 | * This would break get_page_unless_zero() users. |
3374 | */ | 3500 | */ |
3375 | atomic_add(size - 1, &page->_count); | 3501 | page_ref_add(page, size - 1); |
3376 | 3502 | ||
3377 | /* reset page count bias and offset to start of new frag */ | 3503 | /* reset page count bias and offset to start of new frag */ |
3378 | nc->pfmemalloc = page_is_pfmemalloc(page); | 3504 | nc->pfmemalloc = page_is_pfmemalloc(page); |
@@ -3384,7 +3510,7 @@ refill: | |||
3384 | if (unlikely(offset < 0)) { | 3510 | if (unlikely(offset < 0)) { |
3385 | page = virt_to_page(nc->va); | 3511 | page = virt_to_page(nc->va); |
3386 | 3512 | ||
3387 | if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) | 3513 | if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) |
3388 | goto refill; | 3514 | goto refill; |
3389 | 3515 | ||
3390 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) | 3516 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) |
@@ -3392,7 +3518,7 @@ refill: | |||
3392 | size = nc->size; | 3518 | size = nc->size; |
3393 | #endif | 3519 | #endif |
3394 | /* OK, page count is 0, we can safely set it */ | 3520 | /* OK, page count is 0, we can safely set it */ |
3395 | atomic_set(&page->_count, size); | 3521 | set_page_count(page, size); |
3396 | 3522 | ||
3397 | /* reset page count bias and offset to start of new frag */ | 3523 | /* reset page count bias and offset to start of new frag */ |
3398 | nc->pagecnt_bias = size; | 3524 | nc->pagecnt_bias = size; |
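The direct atomic_*() accesses to page->_count are replaced by page_ref_*() helpers so the refcount is read and modified in one place. A userspace sketch of what such thin wrappers look like, modeled over a C11 atomic rather than struct page; the real helpers are defined in include/linux/page_ref.h and this is only an approximation of their behaviour.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page: only the reference count is modeled. */
struct fake_page { atomic_int count; };

static int page_ref_count(struct fake_page *p)
{
        return atomic_load(&p->count);
}

static void set_page_count(struct fake_page *p, int v)
{
        atomic_store(&p->count, v);
}

static void page_ref_add(struct fake_page *p, int nr)
{
        atomic_fetch_add(&p->count, nr);
}

static bool page_ref_sub_and_test(struct fake_page *p, int nr)
{
        /* true when the count reaches zero, mirroring atomic_sub_and_test() */
        return atomic_fetch_sub(&p->count, nr) == nr;
}

int main(void)
{
        struct fake_page page = { 0 };

        set_page_count(&page, 1);
        page_ref_add(&page, 3);                 /* e.g. a frag-cache bias */
        printf("count=%d\n", page_ref_count(&page));
        printf("dropped to zero? %d\n", page_ref_sub_and_test(&page, 4));
        return 0;
}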
@@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone) | |||
3603 | printk("Node %d ", zone_to_nid(zone)); | 3729 | printk("Node %d ", zone_to_nid(zone)); |
3604 | } | 3730 | } |
3605 | 3731 | ||
3732 | long si_mem_available(void) | ||
3733 | { | ||
3734 | long available; | ||
3735 | unsigned long pagecache; | ||
3736 | unsigned long wmark_low = 0; | ||
3737 | unsigned long pages[NR_LRU_LISTS]; | ||
3738 | struct zone *zone; | ||
3739 | int lru; | ||
3740 | |||
3741 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) | ||
3742 | pages[lru] = global_page_state(NR_LRU_BASE + lru); | ||
3743 | |||
3744 | for_each_zone(zone) | ||
3745 | wmark_low += zone->watermark[WMARK_LOW]; | ||
3746 | |||
3747 | /* | ||
3748 | * Estimate the amount of memory available for userspace allocations, | ||
3749 | * without causing swapping. | ||
3750 | */ | ||
3751 | available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; | ||
3752 | |||
3753 | /* | ||
3754 | * Not all the page cache can be freed, otherwise the system will | ||
3755 | * start swapping. Assume at least half of the page cache, or the | ||
3756 | * low watermark worth of cache, needs to stay. | ||
3757 | */ | ||
3758 | pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; | ||
3759 | pagecache -= min(pagecache / 2, wmark_low); | ||
3760 | available += pagecache; | ||
3761 | |||
3762 | /* | ||
3763 | * Part of the reclaimable slab consists of items that are in use, | ||
3764 | * and cannot be freed. Cap this estimate at the low watermark. | ||
3765 | */ | ||
3766 | available += global_page_state(NR_SLAB_RECLAIMABLE) - | ||
3767 | min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); | ||
3768 | |||
3769 | if (available < 0) | ||
3770 | available = 0; | ||
3771 | return available; | ||
3772 | } | ||
3773 | EXPORT_SYMBOL_GPL(si_mem_available); | ||
3774 | |||
3606 | void si_meminfo(struct sysinfo *val) | 3775 | void si_meminfo(struct sysinfo *val) |
3607 | { | 3776 | { |
3608 | val->totalram = totalram_pages; | 3777 | val->totalram = totalram_pages; |
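si_mem_available() produces the same kind of estimate as /proc/meminfo's MemAvailable: free pages minus reserves, plus the share of the file LRU and reclaimable slab that could be dropped without pushing the system into swap. The arithmetic, restated in userspace with purely illustrative page counts:

#include <stdio.h>

static long min_l(long a, long b)
{
        return a < b ? a : b;
}

/* The MemAvailable-style estimate computed by si_mem_available(); all
 * quantities are in pages and the numbers below are made up. */
int main(void)
{
        long free_pages   = 200000;   /* NR_FREE_PAGES            */
        long totalreserve = 20000;    /* totalreserve_pages       */
        long file_lru     = 500000;   /* active + inactive file   */
        long slab_reclaim = 80000;    /* NR_SLAB_RECLAIMABLE      */
        long wmark_low    = 30000;    /* sum of zone low marks    */

        long available = free_pages - totalreserve;

        /* At least half the page cache, or the low watermark worth of it,
         * is assumed to stay resident. */
        available += file_lru - min_l(file_lru / 2, wmark_low);

        /* Same cap for reclaimable slab: only part of it can be freed. */
        available += slab_reclaim - min_l(slab_reclaim / 2, wmark_low);

        if (available < 0)
                available = 0;
        printf("estimated available: %ld pages\n", available);
        return 0;
}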
@@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s) | |||
3935 | } else if (*s == 'z' || *s == 'Z') { | 4104 | } else if (*s == 'z' || *s == 'Z') { |
3936 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 4105 | user_zonelist_order = ZONELIST_ORDER_ZONE; |
3937 | } else { | 4106 | } else { |
3938 | printk(KERN_WARNING | 4107 | pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); |
3939 | "Ignoring invalid numa_zonelist_order value: " | ||
3940 | "%s\n", s); | ||
3941 | return -EINVAL; | 4108 | return -EINVAL; |
3942 | } | 4109 | } |
3943 | return 0; | 4110 | return 0; |
@@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
4401 | else | 4568 | else |
4402 | page_group_by_mobility_disabled = 0; | 4569 | page_group_by_mobility_disabled = 0; |
4403 | 4570 | ||
4404 | pr_info("Built %i zonelists in %s order, mobility grouping %s. " | 4571 | pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", |
4405 | "Total pages: %ld\n", | 4572 | nr_online_nodes, |
4406 | nr_online_nodes, | 4573 | zonelist_order_name[current_zonelist_order], |
4407 | zonelist_order_name[current_zonelist_order], | 4574 | page_group_by_mobility_disabled ? "off" : "on", |
4408 | page_group_by_mobility_disabled ? "off" : "on", | 4575 | vm_total_pages); |
4409 | vm_total_pages); | ||
4410 | #ifdef CONFIG_NUMA | 4576 | #ifdef CONFIG_NUMA |
4411 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); | 4577 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); |
4412 | #endif | 4578 | #endif |
@@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4491 | pg_data_t *pgdat = NODE_DATA(nid); | 4657 | pg_data_t *pgdat = NODE_DATA(nid); |
4492 | unsigned long pfn; | 4658 | unsigned long pfn; |
4493 | unsigned long nr_initialised = 0; | 4659 | unsigned long nr_initialised = 0; |
4660 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4661 | struct memblock_region *r = NULL, *tmp; | ||
4662 | #endif | ||
4494 | 4663 | ||
4495 | if (highest_memmap_pfn < end_pfn - 1) | 4664 | if (highest_memmap_pfn < end_pfn - 1) |
4496 | highest_memmap_pfn = end_pfn - 1; | 4665 | highest_memmap_pfn = end_pfn - 1; |
@@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4504 | 4673 | ||
4505 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 4674 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
4506 | /* | 4675 | /* |
4507 | * There can be holes in boot-time mem_map[]s | 4676 | * There can be holes in boot-time mem_map[]s handed to this |
4508 | * handed to this function. They do not | 4677 | * function. They do not exist on hotplugged memory. |
4509 | * exist on hotplugged memory. | 4678 | */ |
4679 | if (context != MEMMAP_EARLY) | ||
4680 | goto not_early; | ||
4681 | |||
4682 | if (!early_pfn_valid(pfn)) | ||
4683 | continue; | ||
4684 | if (!early_pfn_in_nid(pfn, nid)) | ||
4685 | continue; | ||
4686 | if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) | ||
4687 | break; | ||
4688 | |||
4689 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4690 | /* | ||
4691 | * If not mirrored_kernelcore and ZONE_MOVABLE exists, range | ||
4692 | * from zone_movable_pfn[nid] to end of each node should be | ||
4693 | * ZONE_MOVABLE not ZONE_NORMAL. skip it. | ||
4510 | */ | 4694 | */ |
4511 | if (context == MEMMAP_EARLY) { | 4695 | if (!mirrored_kernelcore && zone_movable_pfn[nid]) |
4512 | if (!early_pfn_valid(pfn)) | 4696 | if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) |
4513 | continue; | 4697 | continue; |
4514 | if (!early_pfn_in_nid(pfn, nid)) | 4698 | |
4699 | /* | ||
4700 | * Check given memblock attribute by firmware which can affect | ||
4701 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is | ||
4702 | * mirrored, it's an overlapped memmap init. skip it. | ||
4703 | */ | ||
4704 | if (mirrored_kernelcore && zone == ZONE_MOVABLE) { | ||
4705 | if (!r || pfn >= memblock_region_memory_end_pfn(r)) { | ||
4706 | for_each_memblock(memory, tmp) | ||
4707 | if (pfn < memblock_region_memory_end_pfn(tmp)) | ||
4708 | break; | ||
4709 | r = tmp; | ||
4710 | } | ||
4711 | if (pfn >= memblock_region_memory_base_pfn(r) && | ||
4712 | memblock_is_mirror(r)) { | ||
4713 | /* already initialized as NORMAL */ | ||
4714 | pfn = memblock_region_memory_end_pfn(r); | ||
4515 | continue; | 4715 | continue; |
4516 | if (!update_defer_init(pgdat, pfn, end_pfn, | 4716 | } |
4517 | &nr_initialised)) | ||
4518 | break; | ||
4519 | } | 4717 | } |
4718 | #endif | ||
4520 | 4719 | ||
4720 | not_early: | ||
4521 | /* | 4721 | /* |
4522 | * Mark the block movable so that blocks are reserved for | 4722 | * Mark the block movable so that blocks are reserved for |
4523 | * movable at startup. This will force kernel allocations | 4723 | * movable at startup. This will force kernel allocations |
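When kernelcore=mirror is in effect, the mirrored ranges were already initialised as ZONE_NORMAL, so the ZONE_MOVABLE pass of memmap_init_zone() jumps the pfn straight to the end of each mirrored memblock region. A simplified model of that skip; the region table below is a hypothetical stand-in for the memblock memory list.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a memblock memory region; "mirror" models memblock_is_mirror(). */
struct region {
        unsigned long base_pfn;
        unsigned long end_pfn;
        bool mirror;
};

int main(void)
{
        /* Hypothetical layout: one mirrored range in the middle of the zone. */
        const struct region regions[] = {
                { 0x0000, 0x8000,  false },
                { 0x8000, 0xc000,  true  },
                { 0xc000, 0x10000, false },
        };
        const unsigned int nr = sizeof(regions) / sizeof(regions[0]);

        for (unsigned long pfn = 0; pfn < 0x10000; pfn++) {
                /* Find the region covering this pfn, as the memblock walk does. */
                const struct region *r = NULL;

                for (unsigned int i = 0; i < nr; i++)
                        if (pfn < regions[i].end_pfn) {
                                r = &regions[i];
                                break;
                        }

                /* Mirrored memory was already set up as ZONE_NORMAL: skip the
                 * whole region instead of touching each pfn. */
                if (r && r->mirror && pfn >= r->base_pfn) {
                        printf("skip mirrored [0x%lx, 0x%lx)\n",
                               r->base_pfn, r->end_pfn);
                        pfn = r->end_pfn - 1;   /* loop increment resumes after it */
                        continue;
                }
                /* ...initialise pfn as ZONE_MOVABLE here... */
        }
        return 0;
}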
@@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
4934 | *zone_end_pfn = min(node_end_pfn, | 5134 | *zone_end_pfn = min(node_end_pfn, |
4935 | arch_zone_highest_possible_pfn[movable_zone]); | 5135 | arch_zone_highest_possible_pfn[movable_zone]); |
4936 | 5136 | ||
4937 | /* Adjust for ZONE_MOVABLE starting within this range */ | ||
4938 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | ||
4939 | *zone_end_pfn > zone_movable_pfn[nid]) { | ||
4940 | *zone_end_pfn = zone_movable_pfn[nid]; | ||
4941 | |||
4942 | /* Check if this whole range is within ZONE_MOVABLE */ | 5137 | /* Check if this whole range is within ZONE_MOVABLE */ |
4943 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 5138 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
4944 | *zone_start_pfn = *zone_end_pfn; | 5139 | *zone_start_pfn = *zone_end_pfn; |
@@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
4953 | unsigned long zone_type, | 5148 | unsigned long zone_type, |
4954 | unsigned long node_start_pfn, | 5149 | unsigned long node_start_pfn, |
4955 | unsigned long node_end_pfn, | 5150 | unsigned long node_end_pfn, |
5151 | unsigned long *zone_start_pfn, | ||
5152 | unsigned long *zone_end_pfn, | ||
4956 | unsigned long *ignored) | 5153 | unsigned long *ignored) |
4957 | { | 5154 | { |
4958 | unsigned long zone_start_pfn, zone_end_pfn; | ||
4959 | |||
4960 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5155 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
4961 | if (!node_start_pfn && !node_end_pfn) | 5156 | if (!node_start_pfn && !node_end_pfn) |
4962 | return 0; | 5157 | return 0; |
4963 | 5158 | ||
4964 | /* Get the start and end of the zone */ | 5159 | /* Get the start and end of the zone */ |
4965 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 5160 | *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4966 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 5161 | *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4967 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5162 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4968 | node_start_pfn, node_end_pfn, | 5163 | node_start_pfn, node_end_pfn, |
4969 | &zone_start_pfn, &zone_end_pfn); | 5164 | zone_start_pfn, zone_end_pfn); |
4970 | 5165 | ||
4971 | /* Check that this node has pages within the zone's required range */ | 5166 | /* Check that this node has pages within the zone's required range */ |
4972 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 5167 | if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) |
4973 | return 0; | 5168 | return 0; |
4974 | 5169 | ||
4975 | /* Move the zone boundaries inside the node if necessary */ | 5170 | /* Move the zone boundaries inside the node if necessary */ |
4976 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 5171 | *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); |
4977 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 5172 | *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); |
4978 | 5173 | ||
4979 | /* Return the spanned pages */ | 5174 | /* Return the spanned pages */ |
4980 | return zone_end_pfn - zone_start_pfn; | 5175 | return *zone_end_pfn - *zone_start_pfn; |
4981 | } | 5176 | } |
4982 | 5177 | ||
4983 | /* | 5178 | /* |
@@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5023 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 5218 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
5024 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 5219 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
5025 | unsigned long zone_start_pfn, zone_end_pfn; | 5220 | unsigned long zone_start_pfn, zone_end_pfn; |
5221 | unsigned long nr_absent; | ||
5026 | 5222 | ||
5027 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5223 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
5028 | if (!node_start_pfn && !node_end_pfn) | 5224 | if (!node_start_pfn && !node_end_pfn) |
@@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5034 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5230 | adjust_zone_range_for_zone_movable(nid, zone_type, |
5035 | node_start_pfn, node_end_pfn, | 5231 | node_start_pfn, node_end_pfn, |
5036 | &zone_start_pfn, &zone_end_pfn); | 5232 | &zone_start_pfn, &zone_end_pfn); |
5037 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 5233 | nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
5234 | |||
5235 | /* | ||
5236 | * ZONE_MOVABLE handling. | ||
5237 | * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages | ||
5238 | * and vice versa. | ||
5239 | */ | ||
5240 | if (zone_movable_pfn[nid]) { | ||
5241 | if (mirrored_kernelcore) { | ||
5242 | unsigned long start_pfn, end_pfn; | ||
5243 | struct memblock_region *r; | ||
5244 | |||
5245 | for_each_memblock(memory, r) { | ||
5246 | start_pfn = clamp(memblock_region_memory_base_pfn(r), | ||
5247 | zone_start_pfn, zone_end_pfn); | ||
5248 | end_pfn = clamp(memblock_region_memory_end_pfn(r), | ||
5249 | zone_start_pfn, zone_end_pfn); | ||
5250 | |||
5251 | if (zone_type == ZONE_MOVABLE && | ||
5252 | memblock_is_mirror(r)) | ||
5253 | nr_absent += end_pfn - start_pfn; | ||
5254 | |||
5255 | if (zone_type == ZONE_NORMAL && | ||
5256 | !memblock_is_mirror(r)) | ||
5257 | nr_absent += end_pfn - start_pfn; | ||
5258 | } | ||
5259 | } else { | ||
5260 | if (zone_type == ZONE_NORMAL) | ||
5261 | nr_absent += node_end_pfn - zone_movable_pfn[nid]; | ||
5262 | } | ||
5263 | } | ||
5264 | |||
5265 | return nr_absent; | ||
5038 | } | 5266 | } |
5039 | 5267 | ||
5040 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5268 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
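The new ZONE_MOVABLE/ZONE_NORMAL absent-page accounting clamps every memblock region to the zone's pfn range and counts the overlap as absent for whichever zone the pages do not belong to. The overlap arithmetic, with clamp() reimplemented to match the kernel macro's bound-to-[lo, hi] behaviour and purely illustrative ranges:

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
        /* Same behaviour as the kernel's clamp(): bound v to [lo, hi]. */
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

int main(void)
{
        /* Illustrative zone and mirrored-region ranges, in pfns. */
        unsigned long zone_start = 0x1000, zone_end = 0x9000;
        unsigned long region_base = 0x0800, region_end = 0x3000;

        unsigned long start = clamp_pfn(region_base, zone_start, zone_end);
        unsigned long end   = clamp_pfn(region_end,  zone_start, zone_end);

        /* For ZONE_MOVABLE, mirrored pages in this window count as absent;
         * for ZONE_NORMAL, the non-mirrored ones do. */
        printf("overlap with zone: %lu pages\n", end - start);
        return 0;
}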
@@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
5042 | unsigned long zone_type, | 5270 | unsigned long zone_type, |
5043 | unsigned long node_start_pfn, | 5271 | unsigned long node_start_pfn, |
5044 | unsigned long node_end_pfn, | 5272 | unsigned long node_end_pfn, |
5273 | unsigned long *zone_start_pfn, | ||
5274 | unsigned long *zone_end_pfn, | ||
5045 | unsigned long *zones_size) | 5275 | unsigned long *zones_size) |
5046 | { | 5276 | { |
5277 | unsigned int zone; | ||
5278 | |||
5279 | *zone_start_pfn = node_start_pfn; | ||
5280 | for (zone = 0; zone < zone_type; zone++) | ||
5281 | *zone_start_pfn += zones_size[zone]; | ||
5282 | |||
5283 | *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; | ||
5284 | |||
5047 | return zones_size[zone_type]; | 5285 | return zones_size[zone_type]; |
5048 | } | 5286 | } |
5049 | 5287 | ||
@@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
5072 | 5310 | ||
5073 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5311 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5074 | struct zone *zone = pgdat->node_zones + i; | 5312 | struct zone *zone = pgdat->node_zones + i; |
5313 | unsigned long zone_start_pfn, zone_end_pfn; | ||
5075 | unsigned long size, real_size; | 5314 | unsigned long size, real_size; |
5076 | 5315 | ||
5077 | size = zone_spanned_pages_in_node(pgdat->node_id, i, | 5316 | size = zone_spanned_pages_in_node(pgdat->node_id, i, |
5078 | node_start_pfn, | 5317 | node_start_pfn, |
5079 | node_end_pfn, | 5318 | node_end_pfn, |
5319 | &zone_start_pfn, | ||
5320 | &zone_end_pfn, | ||
5080 | zones_size); | 5321 | zones_size); |
5081 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, | 5322 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
5082 | node_start_pfn, node_end_pfn, | 5323 | node_start_pfn, node_end_pfn, |
5083 | zholes_size); | 5324 | zholes_size); |
5325 | if (size) | ||
5326 | zone->zone_start_pfn = zone_start_pfn; | ||
5327 | else | ||
5328 | zone->zone_start_pfn = 0; | ||
5084 | zone->spanned_pages = size; | 5329 | zone->spanned_pages = size; |
5085 | zone->present_pages = real_size; | 5330 | zone->present_pages = real_size; |
5086 | 5331 | ||
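In the !CONFIG_HAVE_MEMBLOCK_NODE_MAP stub, a zone's start pfn is simply the node start plus the sizes of all lower zones, which is what the added prefix-sum loop computes. A worked example with made-up zones_size[] values:

#include <stdio.h>

int main(void)
{
        /* Illustrative zones_size[] in pages for a three-zone node (e.g.
         * DMA, DMA32, NORMAL); node_start_pfn is the node's first pfn. */
        unsigned long zones_size[] = { 4096, 1044480, 3145728 };
        unsigned long node_start_pfn = 0;

        unsigned long zone_start_pfn = node_start_pfn;
        for (unsigned int zone = 0; zone < 3; zone++) {
                unsigned long zone_end_pfn = zone_start_pfn + zones_size[zone];

                printf("zone %u: [%lu, %lu)\n", zone, zone_start_pfn, zone_end_pfn);
                zone_start_pfn = zone_end_pfn;      /* next zone starts here */
        }
        return 0;
}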
@@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5201 | { | 5446 | { |
5202 | enum zone_type j; | 5447 | enum zone_type j; |
5203 | int nid = pgdat->node_id; | 5448 | int nid = pgdat->node_id; |
5204 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | ||
5205 | int ret; | 5449 | int ret; |
5206 | 5450 | ||
5207 | pgdat_resize_init(pgdat); | 5451 | pgdat_resize_init(pgdat); |
@@ -5217,11 +5461,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5217 | #endif | 5461 | #endif |
5218 | init_waitqueue_head(&pgdat->kswapd_wait); | 5462 | init_waitqueue_head(&pgdat->kswapd_wait); |
5219 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 5463 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
5464 | #ifdef CONFIG_COMPACTION | ||
5465 | init_waitqueue_head(&pgdat->kcompactd_wait); | ||
5466 | #endif | ||
5220 | pgdat_page_ext_init(pgdat); | 5467 | pgdat_page_ext_init(pgdat); |
5221 | 5468 | ||
5222 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5469 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5223 | struct zone *zone = pgdat->node_zones + j; | 5470 | struct zone *zone = pgdat->node_zones + j; |
5224 | unsigned long size, realsize, freesize, memmap_pages; | 5471 | unsigned long size, realsize, freesize, memmap_pages; |
5472 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
5225 | 5473 | ||
5226 | size = zone->spanned_pages; | 5474 | size = zone->spanned_pages; |
5227 | realsize = freesize = zone->present_pages; | 5475 | realsize = freesize = zone->present_pages; |
@@ -5240,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5240 | " %s zone: %lu pages used for memmap\n", | 5488 | " %s zone: %lu pages used for memmap\n", |
5241 | zone_names[j], memmap_pages); | 5489 | zone_names[j], memmap_pages); |
5242 | } else | 5490 | } else |
5243 | printk(KERN_WARNING | 5491 | pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", |
5244 | " %s zone: %lu pages exceeds freesize %lu\n", | ||
5245 | zone_names[j], memmap_pages, freesize); | 5492 | zone_names[j], memmap_pages, freesize); |
5246 | } | 5493 | } |
5247 | 5494 | ||
@@ -5290,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5290 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 5537 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
5291 | BUG_ON(ret); | 5538 | BUG_ON(ret); |
5292 | memmap_init(size, nid, j, zone_start_pfn); | 5539 | memmap_init(size, nid, j, zone_start_pfn); |
5293 | zone_start_pfn += size; | ||
5294 | } | 5540 | } |
5295 | } | 5541 | } |
5296 | 5542 | ||
@@ -5358,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5358 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, | 5604 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5359 | (u64)start_pfn << PAGE_SHIFT, | 5605 | (u64)start_pfn << PAGE_SHIFT, |
5360 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); | 5606 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); |
5607 | #else | ||
5608 | start_pfn = node_start_pfn; | ||
5361 | #endif | 5609 | #endif |
5362 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5610 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5363 | zones_size, zholes_size); | 5611 | zones_size, zholes_size); |
@@ -5448,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
5448 | min_pfn = min(min_pfn, start_pfn); | 5696 | min_pfn = min(min_pfn, start_pfn); |
5449 | 5697 | ||
5450 | if (min_pfn == ULONG_MAX) { | 5698 | if (min_pfn == ULONG_MAX) { |
5451 | printk(KERN_WARNING | 5699 | pr_warn("Could not find start_pfn for node %d\n", nid); |
5452 | "Could not find start_pfn for node %d\n", nid); | ||
5453 | return 0; | 5700 | return 0; |
5454 | } | 5701 | } |
5455 | 5702 | ||
@@ -5529,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5529 | } | 5776 | } |
5530 | 5777 | ||
5531 | /* | 5778 | /* |
5779 | * If kernelcore=mirror is specified, ignore movablecore option | ||
5780 | */ | ||
5781 | if (mirrored_kernelcore) { | ||
5782 | bool mem_below_4gb_not_mirrored = false; | ||
5783 | |||
5784 | for_each_memblock(memory, r) { | ||
5785 | if (memblock_is_mirror(r)) | ||
5786 | continue; | ||
5787 | |||
5788 | nid = r->nid; | ||
5789 | |||
5790 | usable_startpfn = memblock_region_memory_base_pfn(r); | ||
5791 | |||
5792 | if (usable_startpfn < 0x100000) { | ||
5793 | mem_below_4gb_not_mirrored = true; | ||
5794 | continue; | ||
5795 | } | ||
5796 | |||
5797 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5798 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5799 | usable_startpfn; | ||
5800 | } | ||
5801 | |||
5802 | if (mem_below_4gb_not_mirrored) | ||
5803 | pr_warn("This configuration results in unmirrored kernel memory."); | ||
5804 | |||
5805 | goto out2; | ||
5806 | } | ||
5807 | |||
5808 | /* | ||
5532 | * If movablecore=nn[KMG] was specified, calculate what size of | 5809 | * If movablecore=nn[KMG] was specified, calculate what size of |
5533 | * kernelcore that corresponds so that memory usable for | 5810 | * kernelcore that corresponds so that memory usable for |
5534 | * any allocation type is evenly spread. If both kernelcore | 5811 | * any allocation type is evenly spread. If both kernelcore |
@@ -5788,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) | |||
5788 | */ | 6065 | */ |
5789 | static int __init cmdline_parse_kernelcore(char *p) | 6066 | static int __init cmdline_parse_kernelcore(char *p) |
5790 | { | 6067 | { |
6068 | /* parse kernelcore=mirror */ | ||
6069 | if (parse_option_str(p, "mirror")) { | ||
6070 | mirrored_kernelcore = true; | ||
6071 | return 0; | ||
6072 | } | ||
6073 | |||
5791 | return cmdline_parse_core(p, &required_kernelcore); | 6074 | return cmdline_parse_core(p, &required_kernelcore); |
5792 | } | 6075 | } |
5793 | 6076 | ||
@@ -5885,22 +6168,21 @@ void __init mem_init_print_info(const char *str) | |||
5885 | 6168 | ||
5886 | #undef adj_init_size | 6169 | #undef adj_init_size |
5887 | 6170 | ||
5888 | pr_info("Memory: %luK/%luK available " | 6171 | pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" |
5889 | "(%luK kernel code, %luK rwdata, %luK rodata, " | ||
5890 | "%luK init, %luK bss, %luK reserved, %luK cma-reserved" | ||
5891 | #ifdef CONFIG_HIGHMEM | 6172 | #ifdef CONFIG_HIGHMEM |
5892 | ", %luK highmem" | 6173 | ", %luK highmem" |
5893 | #endif | 6174 | #endif |
5894 | "%s%s)\n", | 6175 | "%s%s)\n", |
5895 | nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), | 6176 | nr_free_pages() << (PAGE_SHIFT - 10), |
5896 | codesize >> 10, datasize >> 10, rosize >> 10, | 6177 | physpages << (PAGE_SHIFT - 10), |
5897 | (init_data_size + init_code_size) >> 10, bss_size >> 10, | 6178 | codesize >> 10, datasize >> 10, rosize >> 10, |
5898 | (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), | 6179 | (init_data_size + init_code_size) >> 10, bss_size >> 10, |
5899 | totalcma_pages << (PAGE_SHIFT-10), | 6180 | (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), |
6181 | totalcma_pages << (PAGE_SHIFT - 10), | ||
5900 | #ifdef CONFIG_HIGHMEM | 6182 | #ifdef CONFIG_HIGHMEM |
5901 | totalhigh_pages << (PAGE_SHIFT-10), | 6183 | totalhigh_pages << (PAGE_SHIFT - 10), |
5902 | #endif | 6184 | #endif |
5903 | str ? ", " : "", str ? str : ""); | 6185 | str ? ", " : "", str ? str : ""); |
5904 | } | 6186 | } |
5905 | 6187 | ||
5906 | /** | 6188 | /** |
@@ -6075,8 +6357,17 @@ static void __setup_per_zone_wmarks(void) | |||
6075 | zone->watermark[WMARK_MIN] = tmp; | 6357 | zone->watermark[WMARK_MIN] = tmp; |
6076 | } | 6358 | } |
6077 | 6359 | ||
6078 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 6360 | /* |
6079 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 6361 | * Set the kswapd watermarks distance according to the |
6362 | * scale factor in proportion to available memory, but | ||
6363 | * ensure a minimum size on small systems. | ||
6364 | */ | ||
6365 | tmp = max_t(u64, tmp >> 2, | ||
6366 | mult_frac(zone->managed_pages, | ||
6367 | watermark_scale_factor, 10000)); | ||
6368 | |||
6369 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; | ||
6370 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; | ||
6080 | 6371 | ||
6081 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, | 6372 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, |
6082 | high_wmark_pages(zone) - low_wmark_pages(zone) - | 6373 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
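The low and high watermarks are now offset from the min watermark by max(min/4, managed_pages * watermark_scale_factor / 10000), so the default factor of 10 keeps roughly 0.1% of each zone between the kswapd wake-up and sleep points, and the new vm.watermark_scale_factor sysctl (handled later in this diff) rebuilds the watermarks when changed. A sketch of the arithmetic, using plain integer division rather than the kernel's mult_frac() and an invented 16GB zone:

#include <stdio.h>

/* The kswapd watermark gap introduced here: the larger of min/4 and
 * managed_pages * watermark_scale_factor / 10000 (0.1% per unit of 10). */
static unsigned long wmark_gap(unsigned long min_pages,
                               unsigned long managed_pages,
                               unsigned long scale_factor)
{
        unsigned long by_scale = managed_pages * scale_factor / 10000;
        unsigned long by_min = min_pages / 4;

        return by_scale > by_min ? by_scale : by_min;
}

int main(void)
{
        /* 16GB zone with 4K pages; an assumed min watermark of 16384 pages. */
        unsigned long managed = 4UL * 1024 * 1024, min = 16384;

        for (unsigned long factor = 10; factor <= 200; factor *= 2) {
                unsigned long gap = wmark_gap(min, managed, factor);

                printf("factor %3lu: low = min + %lu, high = min + %lu\n",
                       factor, gap, 2 * gap);
        }
        return 0;
}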
@@ -6217,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, | |||
6217 | return 0; | 6508 | return 0; |
6218 | } | 6509 | } |
6219 | 6510 | ||
6511 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, | ||
6512 | void __user *buffer, size_t *length, loff_t *ppos) | ||
6513 | { | ||
6514 | int rc; | ||
6515 | |||
6516 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
6517 | if (rc) | ||
6518 | return rc; | ||
6519 | |||
6520 | if (write) | ||
6521 | setup_per_zone_wmarks(); | ||
6522 | |||
6523 | return 0; | ||
6524 | } | ||
6525 | |||
6220 | #ifdef CONFIG_NUMA | 6526 | #ifdef CONFIG_NUMA |
6221 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, | 6527 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, |
6222 | void __user *buffer, size_t *length, loff_t *ppos) | 6528 | void __user *buffer, size_t *length, loff_t *ppos) |
@@ -6408,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
6408 | if (!table) | 6714 | if (!table) |
6409 | panic("Failed to allocate %s hash table\n", tablename); | 6715 | panic("Failed to allocate %s hash table\n", tablename); |
6410 | 6716 | ||
6411 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", | 6717 | pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", |
6412 | tablename, | 6718 | tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); |
6413 | (1UL << log2qty), | ||
6414 | ilog2(size) - PAGE_SHIFT, | ||
6415 | size); | ||
6416 | 6719 | ||
6417 | if (_hash_shift) | 6720 | if (_hash_shift) |
6418 | *_hash_shift = log2qty; | 6721 | *_hash_shift = log2qty; |
@@ -6563,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
6563 | * This check already skips compound tails of THP | 6866 | * This check already skips compound tails of THP |
6564 | * because their page->_count is zero at all time. | 6867 | * because their page->_count is zero at all time. |
6565 | */ | 6868 | */ |
6566 | if (!atomic_read(&page->_count)) { | 6869 | if (!page_ref_count(page)) { |
6567 | if (PageBuddy(page)) | 6870 | if (PageBuddy(page)) |
6568 | iter += (1 << page_order(page)) - 1; | 6871 | iter += (1 << page_order(page)) - 1; |
6569 | continue; | 6872 | continue; |
@@ -6913,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6913 | BUG_ON(!PageBuddy(page)); | 7216 | BUG_ON(!PageBuddy(page)); |
6914 | order = page_order(page); | 7217 | order = page_order(page); |
6915 | #ifdef CONFIG_DEBUG_VM | 7218 | #ifdef CONFIG_DEBUG_VM |
6916 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | 7219 | pr_info("remove from free list %lx %d %lx\n", |
6917 | pfn, 1 << order, end_pfn); | 7220 | pfn, 1 << order, end_pfn); |
6918 | #endif | 7221 | #endif |
6919 | list_del(&page->lru); | 7222 | list_del(&page->lru); |
6920 | rmv_page_order(page); | 7223 | rmv_page_order(page); |
@@ -6927,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6927 | } | 7230 | } |
6928 | #endif | 7231 | #endif |
6929 | 7232 | ||
6930 | #ifdef CONFIG_MEMORY_FAILURE | ||
6931 | bool is_free_buddy_page(struct page *page) | 7233 | bool is_free_buddy_page(struct page *page) |
6932 | { | 7234 | { |
6933 | struct zone *zone = page_zone(page); | 7235 | struct zone *zone = page_zone(page); |
@@ -6946,4 +7248,3 @@ bool is_free_buddy_page(struct page *page) | |||
6946 | 7248 | ||
6947 | return order < MAX_ORDER; | 7249 | return order < MAX_ORDER; |
6948 | } | 7250 | } |
6949 | #endif | ||