path: root/mm/page_alloc.c
author     Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 14:35:36 -0500
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>    2015-02-10 14:35:36 -0500
commit     4ba24fef3eb3b142197135223b90ced2f319cd53 (patch)
tree       a20c125b27740ec7b4c761b11d801108e1b316b2 /mm/page_alloc.c
parent     47c1ffb2b6b630894e9a16442611c056ab21c057 (diff)
parent     98a4a59ee31a12105a2b84f5b8b515ac2cb208ef (diff)
Merge branch 'next' into for-linus
Prepare first round of input updates for 3.20.
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c | 676
1 file changed, 333 insertions(+), 343 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee961958021..7633c503a116 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,19 +48,18 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h> 51#include <linux/page_ext.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/compaction.h> 54#include <linux/compaction.h>
55#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 56#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 57#include <linux/mm_inline.h>
60#include <linux/migrate.h> 58#include <linux/migrate.h>
61#include <linux/page-debug-flags.h> 59#include <linux/page_ext.h>
62#include <linux/hugetlb.h> 60#include <linux/hugetlb.h>
63#include <linux/sched/rt.h> 61#include <linux/sched/rt.h>
62#include <linux/page_owner.h>
64 63
65#include <asm/sections.h> 64#include <asm/sections.h>
66#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
@@ -85,6 +84,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
85 */ 84 */
86DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ 85DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
87EXPORT_PER_CPU_SYMBOL(_numa_mem_); 86EXPORT_PER_CPU_SYMBOL(_numa_mem_);
87int _node_numa_mem_[MAX_NUMNODES];
88#endif 88#endif
89 89
90/* 90/*
@@ -111,6 +111,7 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
111 111
112unsigned long totalram_pages __read_mostly; 112unsigned long totalram_pages __read_mostly;
113unsigned long totalreserve_pages __read_mostly; 113unsigned long totalreserve_pages __read_mostly;
114unsigned long totalcma_pages __read_mostly;
114/* 115/*
115 * When calculating the number of globally allowed dirty pages, there 116 * When calculating the number of globally allowed dirty pages, there
116 * is a certain number of per-zone reserves that should not be 117 * is a certain number of per-zone reserves that should not be
@@ -426,6 +427,42 @@ static inline void prep_zero_page(struct page *page, unsigned int order,
426 427
427#ifdef CONFIG_DEBUG_PAGEALLOC 428#ifdef CONFIG_DEBUG_PAGEALLOC
428unsigned int _debug_guardpage_minorder; 429unsigned int _debug_guardpage_minorder;
430bool _debug_pagealloc_enabled __read_mostly;
431bool _debug_guardpage_enabled __read_mostly;
432
433static int __init early_debug_pagealloc(char *buf)
434{
435 if (!buf)
436 return -EINVAL;
437
438 if (strcmp(buf, "on") == 0)
439 _debug_pagealloc_enabled = true;
440
441 return 0;
442}
443early_param("debug_pagealloc", early_debug_pagealloc);
444
445static bool need_debug_guardpage(void)
446{
447 /* If we don't use debug_pagealloc, we don't need guard page */
448 if (!debug_pagealloc_enabled())
449 return false;
450
451 return true;
452}
453
454static void init_debug_guardpage(void)
455{
456 if (!debug_pagealloc_enabled())
457 return;
458
459 _debug_guardpage_enabled = true;
460}
461
462struct page_ext_operations debug_guardpage_ops = {
463 .need = need_debug_guardpage,
464 .init = init_debug_guardpage,
465};
429 466
430static int __init debug_guardpage_minorder_setup(char *buf) 467static int __init debug_guardpage_minorder_setup(char *buf)
431{ 468{
@@ -441,18 +478,44 @@ static int __init debug_guardpage_minorder_setup(char *buf)
441} 478}
442__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); 479__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
443 480
444static inline void set_page_guard_flag(struct page *page) 481static inline void set_page_guard(struct zone *zone, struct page *page,
482 unsigned int order, int migratetype)
445{ 483{
446 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 484 struct page_ext *page_ext;
485
486 if (!debug_guardpage_enabled())
487 return;
488
489 page_ext = lookup_page_ext(page);
490 __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
491
492 INIT_LIST_HEAD(&page->lru);
493 set_page_private(page, order);
494 /* Guard pages are not available for any usage */
495 __mod_zone_freepage_state(zone, -(1 << order), migratetype);
447} 496}
448 497
449static inline void clear_page_guard_flag(struct page *page) 498static inline void clear_page_guard(struct zone *zone, struct page *page,
499 unsigned int order, int migratetype)
450{ 500{
451 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); 501 struct page_ext *page_ext;
502
503 if (!debug_guardpage_enabled())
504 return;
505
506 page_ext = lookup_page_ext(page);
507 __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
508
509 set_page_private(page, 0);
510 if (!is_migrate_isolate(migratetype))
511 __mod_zone_freepage_state(zone, (1 << order), migratetype);
452} 512}
453#else 513#else
454static inline void set_page_guard_flag(struct page *page) { } 514struct page_ext_operations debug_guardpage_ops = { NULL, };
455static inline void clear_page_guard_flag(struct page *page) { } 515static inline void set_page_guard(struct zone *zone, struct page *page,
516 unsigned int order, int migratetype) {}
517static inline void clear_page_guard(struct zone *zone, struct page *page,
518 unsigned int order, int migratetype) {}
456#endif 519#endif
457 520
458static inline void set_page_order(struct page *page, unsigned int order) 521static inline void set_page_order(struct page *page, unsigned int order)
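
Note on the hunk above: the guard-page bookkeeping moves from a dedicated word in struct page (page->debug_flags, PAGE_DEBUG_FLAG_GUARD) to a bit in a separate per-page struct page_ext record reached through lookup_page_ext(), and the freepage accounting is folded into the new set_page_guard()/clear_page_guard() helpers. Below is a minimal userspace C sketch of that side-table pattern only; page_rec, page_ext_rec and lookup_ext() are made-up illustrative names, not kernel APIs.

#include <stdio.h>

#define NPAGES 8
#define EXT_DEBUG_GUARD 0   /* bit index, analogous to PAGE_EXT_DEBUG_GUARD */

struct page_rec     { int order; };           /* stand-in for struct page      */
struct page_ext_rec { unsigned long flags; }; /* stand-in for struct page_ext  */

static struct page_rec     pages[NPAGES];
static struct page_ext_rec page_ext[NPAGES];  /* side table, one entry per page */

/* analogous to lookup_page_ext(): map a page to its side-table entry */
static struct page_ext_rec *lookup_ext(struct page_rec *page)
{
        return &page_ext[page - pages];
}

static void set_guard(struct page_rec *page)
{
        lookup_ext(page)->flags |= 1UL << EXT_DEBUG_GUARD;
}

static void clear_guard(struct page_rec *page)
{
        lookup_ext(page)->flags &= ~(1UL << EXT_DEBUG_GUARD);
}

int main(void)
{
        set_guard(&pages[3]);
        printf("page 3 guard bit: %lu\n",
               (lookup_ext(&pages[3])->flags >> EXT_DEBUG_GUARD) & 1);
        clear_guard(&pages[3]);
        printf("page 3 guard bit: %lu\n",
               (lookup_ext(&pages[3])->flags >> EXT_DEBUG_GUARD) & 1);
        return 0;
}
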
@@ -468,29 +531,6 @@ static inline void rmv_page_order(struct page *page)
468} 531}
469 532
470/* 533/*
471 * Locate the struct page for both the matching buddy in our
472 * pair (buddy1) and the combined O(n+1) page they form (page).
473 *
474 * 1) Any buddy B1 will have an order O twin B2 which satisfies
475 * the following equation:
476 * B2 = B1 ^ (1 << O)
477 * For example, if the starting buddy (buddy2) is #8 its order
478 * 1 buddy is #10:
479 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
480 *
481 * 2) Any buddy B will have an order O+1 parent P which
482 * satisfies the following equation:
483 * P = B & ~(1 << O)
484 *
485 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
486 */
487static inline unsigned long
488__find_buddy_index(unsigned long page_idx, unsigned int order)
489{
490 return page_idx ^ (1 << order);
491}
492
493/*
494 * This function checks whether a page is free && is the buddy 534 * This function checks whether a page is free && is the buddy
495 * we can do coalesce a page and its buddy if 535 * we can do coalesce a page and its buddy if
496 * (a) the buddy is not in a hole && 536 * (a) the buddy is not in a hole &&
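
Note: the comment deleted above documents the buddy arithmetic the allocator keeps using (the calls to __find_buddy_index() further down in this diff remain), namely B2 = B1 ^ (1 << O) for the order-O buddy and P = B & ~(1 << O) for the combined order-(O+1) block. A small self-contained C check of those two identities, using plain indices:

#include <stdio.h>

/* B2 = B1 ^ (1 << O): index of the order-O buddy of a page index */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* P = B & ~(1 << O): index of the combined order-(O+1) block */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        /* the example from the removed comment: page #8, order 1 -> buddy #10 */
        printf("buddy of 8 at order 1:   %lu\n", find_buddy_index(8, 1));  /* 10 */
        printf("parent of 8 at order 1:  %lu\n", combined_index(8, 1));    /* 8  */
        printf("parent of 10 at order 1: %lu\n", combined_index(10, 1));   /* 8  */
        return 0;
}
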
@@ -570,6 +610,7 @@ static inline void __free_one_page(struct page *page,
570 unsigned long combined_idx; 610 unsigned long combined_idx;
571 unsigned long uninitialized_var(buddy_idx); 611 unsigned long uninitialized_var(buddy_idx);
572 struct page *buddy; 612 struct page *buddy;
613 int max_order = MAX_ORDER;
573 614
574 VM_BUG_ON(!zone_is_initialized(zone)); 615 VM_BUG_ON(!zone_is_initialized(zone));
575 616
@@ -578,13 +619,24 @@ static inline void __free_one_page(struct page *page,
578 return; 619 return;
579 620
580 VM_BUG_ON(migratetype == -1); 621 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) {
623 /*
624 * We restrict max order of merging to prevent merge
625 * between freepages on isolate pageblock and normal
626 * pageblock. Without this, pageblock isolation
627 * could cause incorrect freepage accounting.
628 */
629 max_order = min(MAX_ORDER, pageblock_order + 1);
630 } else {
631 __mod_zone_freepage_state(zone, 1 << order, migratetype);
632 }
581 633
582 page_idx = pfn & ((1 << MAX_ORDER) - 1); 634 page_idx = pfn & ((1 << max_order) - 1);
583 635
584 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); 636 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
585 VM_BUG_ON_PAGE(bad_range(zone, page), page); 637 VM_BUG_ON_PAGE(bad_range(zone, page), page);
586 638
587 while (order < MAX_ORDER-1) { 639 while (order < max_order - 1) {
588 buddy_idx = __find_buddy_index(page_idx, order); 640 buddy_idx = __find_buddy_index(page_idx, order);
589 buddy = page + (buddy_idx - page_idx); 641 buddy = page + (buddy_idx - page_idx);
590 if (!page_is_buddy(page, buddy, order)) 642 if (!page_is_buddy(page, buddy, order))
@@ -594,10 +646,7 @@ static inline void __free_one_page(struct page *page,
594 * merge with it and move up one order. 646 * merge with it and move up one order.
595 */ 647 */
596 if (page_is_guard(buddy)) { 648 if (page_is_guard(buddy)) {
597 clear_page_guard_flag(buddy); 649 clear_page_guard(zone, buddy, order, migratetype);
598 set_page_private(page, 0);
599 __mod_zone_freepage_state(zone, 1 << order,
600 migratetype);
601 } else { 650 } else {
602 list_del(&buddy->lru); 651 list_del(&buddy->lru);
603 zone->free_area[order].nr_free--; 652 zone->free_area[order].nr_free--;
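
Note: the changes above clamp merging to max_order = min(MAX_ORDER, pageblock_order + 1) when freeing into an isolated pageblock, so a free block can never coalesce past the pageblock boundary and skew the freepage accounting. A tiny standalone illustration of how that bound cuts the merge chain short; MAX_ORDER and PAGEBLOCK_ORDER here are example values, and every buddy is assumed free:

#include <stdio.h>

#define MAX_ORDER       11  /* example value */
#define PAGEBLOCK_ORDER  9  /* example value (2MB pageblocks with 4K pages) */

static unsigned int highest_merge_order(unsigned int order, int isolated)
{
        /* max_order = min(MAX_ORDER, pageblock_order + 1) for isolated blocks */
        unsigned int max_order = MAX_ORDER;

        if (isolated)
                max_order = PAGEBLOCK_ORDER + 1 < MAX_ORDER ?
                            PAGEBLOCK_ORDER + 1 : MAX_ORDER;

        /* mirrors "while (order < max_order - 1)", assuming every buddy is free */
        while (order < max_order - 1)
                order++;
        return order;
}

int main(void)
{
        printf("normal pageblock:   merges up to order %u\n", highest_merge_order(0, 0));
        printf("isolated pageblock: merges up to order %u\n", highest_merge_order(0, 1));
        return 0;
}
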
@@ -651,8 +700,10 @@ static inline int free_pages_check(struct page *page)
651 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 700 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
652 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 701 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
653 } 702 }
654 if (unlikely(mem_cgroup_bad_page_check(page))) 703#ifdef CONFIG_MEMCG
655 bad_reason = "cgroup check failed"; 704 if (unlikely(page->mem_cgroup))
705 bad_reason = "page still charged to cgroup";
706#endif
656 if (unlikely(bad_reason)) { 707 if (unlikely(bad_reason)) {
657 bad_page(page, bad_reason, bad_flags); 708 bad_page(page, bad_reason, bad_flags);
658 return 1; 709 return 1;
@@ -716,14 +767,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
716 /* must delete as __free_one_page list manipulates */ 767 /* must delete as __free_one_page list manipulates */
717 list_del(&page->lru); 768 list_del(&page->lru);
718 mt = get_freepage_migratetype(page); 769 mt = get_freepage_migratetype(page);
770 if (unlikely(has_isolate_pageblock(zone)))
771 mt = get_pageblock_migratetype(page);
772
719 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 773 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
720 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 774 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
721 trace_mm_page_pcpu_drain(page, 0, mt); 775 trace_mm_page_pcpu_drain(page, 0, mt);
722 if (likely(!is_migrate_isolate_page(page))) {
723 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
724 if (is_migrate_cma(mt))
725 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
726 }
727 } while (--to_free && --batch_free && !list_empty(list)); 776 } while (--to_free && --batch_free && !list_empty(list));
728 } 777 }
729 spin_unlock(&zone->lock); 778 spin_unlock(&zone->lock);
@@ -740,9 +789,11 @@ static void free_one_page(struct zone *zone,
740 if (nr_scanned) 789 if (nr_scanned)
741 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); 790 __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
742 791
792 if (unlikely(has_isolate_pageblock(zone) ||
793 is_migrate_isolate(migratetype))) {
794 migratetype = get_pfnblock_migratetype(page, pfn);
795 }
743 __free_one_page(page, pfn, zone, order, migratetype); 796 __free_one_page(page, pfn, zone, order, migratetype);
744 if (unlikely(!is_migrate_isolate(migratetype)))
745 __mod_zone_freepage_state(zone, 1 << order, migratetype);
746 spin_unlock(&zone->lock); 797 spin_unlock(&zone->lock);
747} 798}
748 799
@@ -751,6 +802,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
751 int i; 802 int i;
752 int bad = 0; 803 int bad = 0;
753 804
805 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
807
754 trace_mm_page_free(page, order); 808 trace_mm_page_free(page, order);
755 kmemcheck_free_shadow(page, order); 809 kmemcheck_free_shadow(page, order);
756 810
@@ -761,6 +815,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
761 if (bad) 815 if (bad)
762 return false; 816 return false;
763 817
818 reset_page_owner(page, order);
819
764 if (!PageHighMem(page)) { 820 if (!PageHighMem(page)) {
765 debug_check_no_locks_freed(page_address(page), 821 debug_check_no_locks_freed(page_address(page),
766 PAGE_SIZE << order); 822 PAGE_SIZE << order);
@@ -867,23 +923,18 @@ static inline void expand(struct zone *zone, struct page *page,
867 size >>= 1; 923 size >>= 1;
868 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); 924 VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
869 925
870#ifdef CONFIG_DEBUG_PAGEALLOC 926 if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
871 if (high < debug_guardpage_minorder()) { 927 debug_guardpage_enabled() &&
928 high < debug_guardpage_minorder()) {
872 /* 929 /*
873 * Mark as guard pages (or page), that will allow to 930 * Mark as guard pages (or page), that will allow to
874 * merge back to allocator when buddy will be freed. 931 * merge back to allocator when buddy will be freed.
875 * Corresponding page table entries will not be touched, 932 * Corresponding page table entries will not be touched,
876 * pages will stay not present in virtual address space 933 * pages will stay not present in virtual address space
877 */ 934 */
878 INIT_LIST_HEAD(&page[size].lru); 935 set_page_guard(zone, &page[size], high, migratetype);
879 set_page_guard_flag(&page[size]);
880 set_page_private(&page[size], high);
881 /* Guard pages are not available for any usage */
882 __mod_zone_freepage_state(zone, -(1 << high),
883 migratetype);
884 continue; 936 continue;
885 } 937 }
886#endif
887 list_add(&page[size].lru, &area->free_list[migratetype]); 938 list_add(&page[size].lru, &area->free_list[migratetype]);
888 area->nr_free++; 939 area->nr_free++;
889 set_page_order(&page[size], high); 940 set_page_order(&page[size], high);
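
Note: expand() is the buddy split, repeatedly halving a block of order high and returning each upper half to the free list of the next lower order; the hunk above only replaces the open-coded guard-page marking with set_page_guard() and a runtime debug_guardpage_enabled() check. A minimal userspace sketch of the split itself, with plain integers standing in for page indices:

#include <stdio.h>

/*
 * Split a free block of order `high` down to order `low`, printing which
 * half-block (start index, order) would be put back on each free list.
 */
static void expand_block(unsigned long start, unsigned int high, unsigned int low)
{
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("free list order %u <- block starting at page %lu\n",
                       high, start + size);
        }
        printf("allocated: block starting at page %lu, order %u\n", start, low);
}

int main(void)
{
        expand_block(0, 3, 0);  /* split an order-3 block to satisfy an order-0 request */
        return 0;
}
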
@@ -908,8 +959,10 @@ static inline int check_new_page(struct page *page)
908 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 959 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
909 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 960 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
910 } 961 }
911 if (unlikely(mem_cgroup_bad_page_check(page))) 962#ifdef CONFIG_MEMCG
912 bad_reason = "cgroup check failed"; 963 if (unlikely(page->mem_cgroup))
964 bad_reason = "page still charged to cgroup";
965#endif
913 if (unlikely(bad_reason)) { 966 if (unlikely(bad_reason)) {
914 bad_page(page, bad_reason, bad_flags); 967 bad_page(page, bad_reason, bad_flags);
915 return 1; 968 return 1;
@@ -939,6 +992,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
939 if (order && (gfp_flags & __GFP_COMP)) 992 if (order && (gfp_flags & __GFP_COMP))
940 prep_compound_page(page, order); 993 prep_compound_page(page, order);
941 994
995 set_page_owner(page, order, gfp_flags);
996
942 return 0; 997 return 0;
943} 998}
944 999
@@ -1014,7 +1069,7 @@ int move_freepages(struct zone *zone,
1014 * Remove at a later date when no bug reports exist related to 1069 * Remove at a later date when no bug reports exist related to
1015 * grouping pages by mobility 1070 * grouping pages by mobility
1016 */ 1071 */
1017 BUG_ON(page_zone(start_page) != page_zone(end_page)); 1072 VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1018#endif 1073#endif
1019 1074
1020 for (page = start_page; page <= end_page;) { 1075 for (page = start_page; page <= end_page;) {
@@ -1277,55 +1332,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1277#endif 1332#endif
1278 1333
1279/* 1334/*
1280 * Drain pages of the indicated processor. 1335 * Drain pcplists of the indicated processor and zone.
1281 * 1336 *
1282 * The processor must either be the current processor and the 1337 * The processor must either be the current processor and the
1283 * thread pinned to the current processor or a processor that 1338 * thread pinned to the current processor or a processor that
1284 * is not online. 1339 * is not online.
1285 */ 1340 */
1286static void drain_pages(unsigned int cpu) 1341static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1287{ 1342{
1288 unsigned long flags; 1343 unsigned long flags;
1289 struct zone *zone; 1344 struct per_cpu_pageset *pset;
1345 struct per_cpu_pages *pcp;
1290 1346
1291 for_each_populated_zone(zone) { 1347 local_irq_save(flags);
1292 struct per_cpu_pageset *pset; 1348 pset = per_cpu_ptr(zone->pageset, cpu);
1293 struct per_cpu_pages *pcp;
1294 1349
1295 local_irq_save(flags); 1350 pcp = &pset->pcp;
1296 pset = per_cpu_ptr(zone->pageset, cpu); 1351 if (pcp->count) {
1352 free_pcppages_bulk(zone, pcp->count, pcp);
1353 pcp->count = 0;
1354 }
1355 local_irq_restore(flags);
1356}
1297 1357
1298 pcp = &pset->pcp; 1358/*
1299 if (pcp->count) { 1359 * Drain pcplists of all zones on the indicated processor.
1300 free_pcppages_bulk(zone, pcp->count, pcp); 1360 *
1301 pcp->count = 0; 1361 * The processor must either be the current processor and the
1302 } 1362 * thread pinned to the current processor or a processor that
1303 local_irq_restore(flags); 1363 * is not online.
1364 */
1365static void drain_pages(unsigned int cpu)
1366{
1367 struct zone *zone;
1368
1369 for_each_populated_zone(zone) {
1370 drain_pages_zone(cpu, zone);
1304 } 1371 }
1305} 1372}
1306 1373
1307/* 1374/*
1308 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1375 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1376 *
1377 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1378 * the single zone's pages.
1309 */ 1379 */
1310void drain_local_pages(void *arg) 1380void drain_local_pages(struct zone *zone)
1311{ 1381{
1312 drain_pages(smp_processor_id()); 1382 int cpu = smp_processor_id();
1383
1384 if (zone)
1385 drain_pages_zone(cpu, zone);
1386 else
1387 drain_pages(cpu);
1313} 1388}
1314 1389
1315/* 1390/*
1316 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1391 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1317 * 1392 *
1393 * When zone parameter is non-NULL, spill just the single zone's pages.
1394 *
1318 * Note that this code is protected against sending an IPI to an offline 1395 * Note that this code is protected against sending an IPI to an offline
1319 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1396 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1320 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1397 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1321 * nothing keeps CPUs from showing up after we populated the cpumask and 1398 * nothing keeps CPUs from showing up after we populated the cpumask and
1322 * before the call to on_each_cpu_mask(). 1399 * before the call to on_each_cpu_mask().
1323 */ 1400 */
1324void drain_all_pages(void) 1401void drain_all_pages(struct zone *zone)
1325{ 1402{
1326 int cpu; 1403 int cpu;
1327 struct per_cpu_pageset *pcp;
1328 struct zone *zone;
1329 1404
1330 /* 1405 /*
1331 * Allocate in the BSS so we wont require allocation in 1406 * Allocate in the BSS so we wont require allocation in
@@ -1340,20 +1415,31 @@ void drain_all_pages(void)
1340 * disables preemption as part of its processing 1415 * disables preemption as part of its processing
1341 */ 1416 */
1342 for_each_online_cpu(cpu) { 1417 for_each_online_cpu(cpu) {
1418 struct per_cpu_pageset *pcp;
1419 struct zone *z;
1343 bool has_pcps = false; 1420 bool has_pcps = false;
1344 for_each_populated_zone(zone) { 1421
1422 if (zone) {
1345 pcp = per_cpu_ptr(zone->pageset, cpu); 1423 pcp = per_cpu_ptr(zone->pageset, cpu);
1346 if (pcp->pcp.count) { 1424 if (pcp->pcp.count)
1347 has_pcps = true; 1425 has_pcps = true;
1348 break; 1426 } else {
1427 for_each_populated_zone(z) {
1428 pcp = per_cpu_ptr(z->pageset, cpu);
1429 if (pcp->pcp.count) {
1430 has_pcps = true;
1431 break;
1432 }
1349 } 1433 }
1350 } 1434 }
1435
1351 if (has_pcps) 1436 if (has_pcps)
1352 cpumask_set_cpu(cpu, &cpus_with_pcps); 1437 cpumask_set_cpu(cpu, &cpus_with_pcps);
1353 else 1438 else
1354 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1439 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1355 } 1440 }
1356 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1441 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1442 zone, 1);
1357} 1443}
1358 1444
1359#ifdef CONFIG_HIBERNATION 1445#ifdef CONFIG_HIBERNATION
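
Note: the draining code above is reworked so drain_local_pages() and drain_all_pages() take a zone argument; passing NULL keeps the old behaviour of draining every populated zone, while a non-NULL zone drains only that zone's pcplists. A generic userspace sketch of that "NULL means all" interface (illustrative names, not the kernel API):

#include <stdio.h>
#include <stddef.h>

struct zone_sim { const char *name; int pcp_count; };

static struct zone_sim zones[] = {
        { "DMA", 3 }, { "Normal", 12 }, { "HighMem", 7 },
};
#define NZONES (sizeof(zones) / sizeof(zones[0]))

static void drain_zone(struct zone_sim *z)
{
        printf("draining %d pages from %s\n", z->pcp_count, z->name);
        z->pcp_count = 0;
}

/* NULL zone: drain everything; otherwise drain only the requested zone */
static void drain_pages_sim(struct zone_sim *zone)
{
        size_t i;

        if (zone) {
                drain_zone(zone);
                return;
        }
        for (i = 0; i < NZONES; i++)
                drain_zone(&zones[i]);
}

int main(void)
{
        drain_pages_sim(&zones[1]);  /* targeted drain of a single zone */
        drain_pages_sim(NULL);       /* drain all zones */
        return 0;
}
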
@@ -1480,12 +1566,15 @@ void split_page(struct page *page, unsigned int order)
1480 split_page(virt_to_page(page[0].shadow), order); 1566 split_page(virt_to_page(page[0].shadow), order);
1481#endif 1567#endif
1482 1568
1483 for (i = 1; i < (1 << order); i++) 1569 set_page_owner(page, 0, 0);
1570 for (i = 1; i < (1 << order); i++) {
1484 set_page_refcounted(page + i); 1571 set_page_refcounted(page + i);
1572 set_page_owner(page + i, 0, 0);
1573 }
1485} 1574}
1486EXPORT_SYMBOL_GPL(split_page); 1575EXPORT_SYMBOL_GPL(split_page);
1487 1576
1488static int __isolate_free_page(struct page *page, unsigned int order) 1577int __isolate_free_page(struct page *page, unsigned int order)
1489{ 1578{
1490 unsigned long watermark; 1579 unsigned long watermark;
1491 struct zone *zone; 1580 struct zone *zone;
@@ -1521,6 +1610,7 @@ static int __isolate_free_page(struct page *page, unsigned int order)
1521 } 1610 }
1522 } 1611 }
1523 1612
1613 set_page_owner(page, order, 0);
1524 return 1UL << order; 1614 return 1UL << order;
1525} 1615}
1526 1616
@@ -1613,8 +1703,8 @@ again:
1613 1703
1614 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); 1704 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1615 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && 1705 if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
1616 !zone_is_fair_depleted(zone)) 1706 !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
1617 zone_set_flag(zone, ZONE_FAIR_DEPLETED); 1707 set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1618 1708
1619 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1709 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1620 zone_statistics(preferred_zone, zone, gfp_flags); 1710 zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1715,7 +1805,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1715 unsigned long mark, int classzone_idx, int alloc_flags, 1805 unsigned long mark, int classzone_idx, int alloc_flags,
1716 long free_pages) 1806 long free_pages)
1717{ 1807{
1718 /* free_pages my go negative - that's OK */ 1808 /* free_pages may go negative - that's OK */
1719 long min = mark; 1809 long min = mark;
1720 int o; 1810 int o;
1721 long free_cma = 0; 1811 long free_cma = 0;
@@ -1934,7 +2024,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
1934 mod_zone_page_state(zone, NR_ALLOC_BATCH, 2024 mod_zone_page_state(zone, NR_ALLOC_BATCH,
1935 high_wmark_pages(zone) - low_wmark_pages(zone) - 2025 high_wmark_pages(zone) - low_wmark_pages(zone) -
1936 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); 2026 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
1937 zone_clear_flag(zone, ZONE_FAIR_DEPLETED); 2027 clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
1938 } while (zone++ != preferred_zone); 2028 } while (zone++ != preferred_zone);
1939} 2029}
1940 2030
@@ -1963,7 +2053,7 @@ zonelist_scan:
1963 2053
1964 /* 2054 /*
1965 * Scan zonelist, looking for a zone with enough free. 2055 * Scan zonelist, looking for a zone with enough free.
1966 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
1967 */ 2057 */
1968 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2058 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1969 high_zoneidx, nodemask) { 2059 high_zoneidx, nodemask) {
@@ -1974,7 +2064,7 @@ zonelist_scan:
1974 continue; 2064 continue;
1975 if (cpusets_enabled() && 2065 if (cpusets_enabled() &&
1976 (alloc_flags & ALLOC_CPUSET) && 2066 (alloc_flags & ALLOC_CPUSET) &&
1977 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 2067 !cpuset_zone_allowed(zone, gfp_mask))
1978 continue; 2068 continue;
1979 /* 2069 /*
1980 * Distribute pages in proportion to the individual 2070 * Distribute pages in proportion to the individual
@@ -1985,7 +2075,7 @@ zonelist_scan:
1985 if (alloc_flags & ALLOC_FAIR) { 2075 if (alloc_flags & ALLOC_FAIR) {
1986 if (!zone_local(preferred_zone, zone)) 2076 if (!zone_local(preferred_zone, zone))
1987 break; 2077 break;
1988 if (zone_is_fair_depleted(zone)) { 2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
1989 nr_fair_skipped++; 2079 nr_fair_skipped++;
1990 continue; 2080 continue;
1991 } 2081 }
@@ -2253,6 +2343,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2253 } 2343 }
2254 2344
2255 /* 2345 /*
2346 * PM-freezer should be notified that there might be an OOM killer on
2347 * its way to kill and wake somebody up. This is too early and we might
2348 * end up not killing anything but false positives are acceptable.
2349 * See freeze_processes.
2350 */
2351 note_oom_kill();
2352
2353 /*
2256 * Go through the zonelist yet one more time, keep very high watermark 2354 * Go through the zonelist yet one more time, keep very high watermark
2257 * here, this is only to catch a parallel oom killing, we must fail if 2355 * here, this is only to catch a parallel oom killing, we must fail if
2258 * we're still under heavy pressure. 2356 * we're still under heavy pressure.
@@ -2296,58 +2394,59 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 struct zonelist *zonelist, enum zone_type high_zoneidx, 2394 struct zonelist *zonelist, enum zone_type high_zoneidx,
2297 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2298 int classzone_idx, int migratetype, enum migrate_mode mode, 2396 int classzone_idx, int migratetype, enum migrate_mode mode,
2299 bool *contended_compaction, bool *deferred_compaction, 2397 int *contended_compaction, bool *deferred_compaction)
2300 unsigned long *did_some_progress)
2301{ 2398{
2302 if (!order) 2399 unsigned long compact_result;
2303 return NULL; 2400 struct page *page;
2304 2401
2305 if (compaction_deferred(preferred_zone, order)) { 2402 if (!order)
2306 *deferred_compaction = true;
2307 return NULL; 2403 return NULL;
2308 }
2309 2404
2310 current->flags |= PF_MEMALLOC; 2405 current->flags |= PF_MEMALLOC;
2311 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2406 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2312 nodemask, mode, 2407 nodemask, mode,
2313 contended_compaction); 2408 contended_compaction,
2409 alloc_flags, classzone_idx);
2314 current->flags &= ~PF_MEMALLOC; 2410 current->flags &= ~PF_MEMALLOC;
2315 2411
2316 if (*did_some_progress != COMPACT_SKIPPED) { 2412 switch (compact_result) {
2317 struct page *page; 2413 case COMPACT_DEFERRED:
2318 2414 *deferred_compaction = true;
2319 /* Page migration frees to the PCP lists but we want merging */ 2415 /* fall-through */
2320 drain_pages(get_cpu()); 2416 case COMPACT_SKIPPED:
2321 put_cpu(); 2417 return NULL;
2418 default:
2419 break;
2420 }
2322 2421
2323 page = get_page_from_freelist(gfp_mask, nodemask, 2422 /*
2324 order, zonelist, high_zoneidx, 2423 * At least in one zone compaction wasn't deferred or skipped, so let's
2325 alloc_flags & ~ALLOC_NO_WATERMARKS, 2424 * count a compaction stall
2326 preferred_zone, classzone_idx, migratetype); 2425 */
2327 if (page) { 2426 count_vm_event(COMPACTSTALL);
2328 preferred_zone->compact_blockskip_flush = false;
2329 compaction_defer_reset(preferred_zone, order, true);
2330 count_vm_event(COMPACTSUCCESS);
2331 return page;
2332 }
2333 2427
2334 /* 2428 page = get_page_from_freelist(gfp_mask, nodemask,
2335 * It's bad if compaction run occurs and fails. 2429 order, zonelist, high_zoneidx,
2336 * The most likely reason is that pages exist, 2430 alloc_flags & ~ALLOC_NO_WATERMARKS,
2337 * but not enough to satisfy watermarks. 2431 preferred_zone, classzone_idx, migratetype);
2338 */
2339 count_vm_event(COMPACTFAIL);
2340 2432
2341 /* 2433 if (page) {
2342 * As async compaction considers a subset of pageblocks, only 2434 struct zone *zone = page_zone(page);
2343 * defer if the failure was a sync compaction failure.
2344 */
2345 if (mode != MIGRATE_ASYNC)
2346 defer_compaction(preferred_zone, order);
2347 2435
2348 cond_resched(); 2436 zone->compact_blockskip_flush = false;
2437 compaction_defer_reset(zone, order, true);
2438 count_vm_event(COMPACTSUCCESS);
2439 return page;
2349 } 2440 }
2350 2441
2442 /*
2443 * It's bad if compaction run occurs and fails. The most likely reason
2444 * is that pages exist, but not enough to satisfy watermarks.
2445 */
2446 count_vm_event(COMPACTFAIL);
2447
2448 cond_resched();
2449
2351 return NULL; 2450 return NULL;
2352} 2451}
2353#else 2452#else
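
Note: the rewritten __alloc_pages_direct_compact() above dispatches on the try_to_compact_pages() result, with COMPACT_DEFERRED deliberately falling through to COMPACT_SKIPPED so that both return NULL before a COMPACTSTALL event is counted. A standalone C illustration of that fall-through; the enum values below are stand-ins for the kernel's compaction results:

#include <stdio.h>
#include <stdbool.h>

enum compact_result { COMPACT_DEFERRED, COMPACT_SKIPPED, COMPACT_PARTIAL };

/* Returns true if the caller should give up without counting a stall. */
static bool bail_out(enum compact_result result, bool *deferred)
{
        switch (result) {
        case COMPACT_DEFERRED:
                *deferred = true;
                /* fall-through */
        case COMPACT_SKIPPED:
                return true;
        default:
                return false;
        }
}

int main(void)
{
        bool deferred = false;

        printf("deferred -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_DEFERRED, &deferred), deferred);
        deferred = false;
        printf("skipped  -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_SKIPPED, &deferred), deferred);
        deferred = false;
        printf("partial  -> bail=%d, deferred flag=%d\n",
               bail_out(COMPACT_PARTIAL, &deferred), deferred);
        return 0;
}
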
@@ -2355,9 +2454,8 @@ static inline struct page *
2355__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2454__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2356 struct zonelist *zonelist, enum zone_type high_zoneidx, 2455 struct zonelist *zonelist, enum zone_type high_zoneidx,
2357 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2456 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2358 int classzone_idx, int migratetype, 2457 int classzone_idx, int migratetype, enum migrate_mode mode,
2359 enum migrate_mode mode, bool *contended_compaction, 2458 int *contended_compaction, bool *deferred_compaction)
2360 bool *deferred_compaction, unsigned long *did_some_progress)
2361{ 2459{
2362 return NULL; 2460 return NULL;
2363} 2461}
@@ -2422,7 +2520,7 @@ retry:
2422 * pages are pinned on the per-cpu lists. Drain them and try again 2520 * pages are pinned on the per-cpu lists. Drain them and try again
2423 */ 2521 */
2424 if (!page && !drained) { 2522 if (!page && !drained) {
2425 drain_all_pages(); 2523 drain_all_pages(NULL);
2426 drained = true; 2524 drained = true;
2427 goto retry; 2525 goto retry;
2428 } 2526 }
@@ -2457,12 +2555,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2457static void wake_all_kswapds(unsigned int order, 2555static void wake_all_kswapds(unsigned int order,
2458 struct zonelist *zonelist, 2556 struct zonelist *zonelist,
2459 enum zone_type high_zoneidx, 2557 enum zone_type high_zoneidx,
2460 struct zone *preferred_zone) 2558 struct zone *preferred_zone,
2559 nodemask_t *nodemask)
2461{ 2560{
2462 struct zoneref *z; 2561 struct zoneref *z;
2463 struct zone *zone; 2562 struct zone *zone;
2464 2563
2465 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2564 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2565 high_zoneidx, nodemask)
2466 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2566 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2467} 2567}
2468 2568
@@ -2492,7 +2592,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2492 alloc_flags |= ALLOC_HARDER; 2592 alloc_flags |= ALLOC_HARDER;
2493 /* 2593 /*
2494 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the 2594 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
2495 * comment for __cpuset_node_allowed_softwall(). 2595 * comment for __cpuset_node_allowed().
2496 */ 2596 */
2497 alloc_flags &= ~ALLOC_CPUSET; 2597 alloc_flags &= ~ALLOC_CPUSET;
2498 } else if (unlikely(rt_task(current)) && !in_interrupt()) 2598 } else if (unlikely(rt_task(current)) && !in_interrupt())
@@ -2509,7 +2609,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2509 alloc_flags |= ALLOC_NO_WATERMARKS; 2609 alloc_flags |= ALLOC_NO_WATERMARKS;
2510 } 2610 }
2511#ifdef CONFIG_CMA 2611#ifdef CONFIG_CMA
2512 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) 2612 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2513 alloc_flags |= ALLOC_CMA; 2613 alloc_flags |= ALLOC_CMA;
2514#endif 2614#endif
2515 return alloc_flags; 2615 return alloc_flags;
@@ -2533,7 +2633,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2533 unsigned long did_some_progress; 2633 unsigned long did_some_progress;
2534 enum migrate_mode migration_mode = MIGRATE_ASYNC; 2634 enum migrate_mode migration_mode = MIGRATE_ASYNC;
2535 bool deferred_compaction = false; 2635 bool deferred_compaction = false;
2536 bool contended_compaction = false; 2636 int contended_compaction = COMPACT_CONTENDED_NONE;
2537 2637
2538 /* 2638 /*
2539 * In the slowpath, we sanity check order to avoid ever trying to 2639 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2660,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2560 2660
2561restart: 2661restart:
2562 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2662 if (!(gfp_mask & __GFP_NO_KSWAPD))
2563 wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); 2663 wake_all_kswapds(order, zonelist, high_zoneidx,
2664 preferred_zone, nodemask);
2564 2665
2565 /* 2666 /*
2566 * OK, we're below the kswapd watermark and have kicked background 2667 * OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2734,40 @@ rebalance:
2633 preferred_zone, 2734 preferred_zone,
2634 classzone_idx, migratetype, 2735 classzone_idx, migratetype,
2635 migration_mode, &contended_compaction, 2736 migration_mode, &contended_compaction,
2636 &deferred_compaction, 2737 &deferred_compaction);
2637 &did_some_progress);
2638 if (page) 2738 if (page)
2639 goto got_pg; 2739 goto got_pg;
2640 2740
2641 /* 2741 /* Checks for THP-specific high-order allocations */
2642 * If compaction is deferred for high-order allocations, it is because 2742 if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
2643 * sync compaction recently failed. In this is the case and the caller 2743 /*
2644 * requested a movable allocation that does not heavily disrupt the 2744 * If compaction is deferred for high-order allocations, it is
2645 * system then fail the allocation instead of entering direct reclaim. 2745 * because sync compaction recently failed. If this is the case
2646 */ 2746 * and the caller requested a THP allocation, we do not want
2647 if ((deferred_compaction || contended_compaction) && 2747 * to heavily disrupt the system, so we fail the allocation
2648 (gfp_mask & __GFP_NO_KSWAPD)) 2748 * instead of entering direct reclaim.
2649 goto nopage; 2749 */
2750 if (deferred_compaction)
2751 goto nopage;
2752
2753 /*
2754 * In all zones where compaction was attempted (and not
2755 * deferred or skipped), lock contention has been detected.
2756 * For THP allocation we do not want to disrupt the others
2757 * so we fallback to base pages instead.
2758 */
2759 if (contended_compaction == COMPACT_CONTENDED_LOCK)
2760 goto nopage;
2761
2762 /*
2763 * If compaction was aborted due to need_resched(), we do not
2764 * want to further increase allocation latency, unless it is
2765 * khugepaged trying to collapse.
2766 */
2767 if (contended_compaction == COMPACT_CONTENDED_SCHED
2768 && !(current->flags & PF_KTHREAD))
2769 goto nopage;
2770 }
2650 2771
2651 /* 2772 /*
2652 * It can become very expensive to allocate transparent hugepages at 2773 * It can become very expensive to allocate transparent hugepages at
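
Note: the block added above restricts the bail-out heuristics to THP-style (GFP_TRANSHUGE) allocations: give up if compaction was deferred, if every attempted zone reported lock contention, or if compaction aborted for need_resched() and the caller is not a kernel thread. A compact sketch of that decision as a standalone predicate; is_kthread stands in for current->flags & PF_KTHREAD:

#include <stdio.h>
#include <stdbool.h>

enum compact_contended {
        COMPACT_CONTENDED_NONE,
        COMPACT_CONTENDED_LOCK,   /* lock contention detected in all attempted zones */
        COMPACT_CONTENDED_SCHED,  /* compaction aborted due to need_resched() */
};

/* Should a THP allocation fall back to base pages after failed compaction? */
static bool thp_should_give_up(bool deferred, enum compact_contended contended,
                               bool is_kthread)
{
        if (deferred)
                return true;
        if (contended == COMPACT_CONTENDED_LOCK)
                return true;
        if (contended == COMPACT_CONTENDED_SCHED && !is_kthread)
                return true;
        return false;
}

int main(void)
{
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_SCHED, false)); /* 1 */
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_SCHED, true));  /* 0 */
        printf("%d\n", thp_should_give_up(false, COMPACT_CONTENDED_NONE, false));  /* 0 */
        return 0;
}
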
@@ -2726,8 +2847,7 @@ rebalance:
2726 preferred_zone, 2847 preferred_zone,
2727 classzone_idx, migratetype, 2848 classzone_idx, migratetype,
2728 migration_mode, &contended_compaction, 2849 migration_mode, &contended_compaction,
2729 &deferred_compaction, 2850 &deferred_compaction);
2730 &did_some_progress);
2731 if (page) 2851 if (page)
2732 goto got_pg; 2852 goto got_pg;
2733 } 2853 }
@@ -2753,7 +2873,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2753 struct zone *preferred_zone; 2873 struct zone *preferred_zone;
2754 struct zoneref *preferred_zoneref; 2874 struct zoneref *preferred_zoneref;
2755 struct page *page = NULL; 2875 struct page *page = NULL;
2756 int migratetype = allocflags_to_migratetype(gfp_mask); 2876 int migratetype = gfpflags_to_migratetype(gfp_mask);
2757 unsigned int cpuset_mems_cookie; 2877 unsigned int cpuset_mems_cookie;
2758 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2878 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2759 int classzone_idx; 2879 int classzone_idx;
@@ -2775,6 +2895,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2775 if (unlikely(!zonelist->_zonerefs->zone)) 2895 if (unlikely(!zonelist->_zonerefs->zone))
2776 return NULL; 2896 return NULL;
2777 2897
2898 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
2899 alloc_flags |= ALLOC_CMA;
2900
2778retry_cpuset: 2901retry_cpuset:
2779 cpuset_mems_cookie = read_mems_allowed_begin(); 2902 cpuset_mems_cookie = read_mems_allowed_begin();
2780 2903
@@ -2786,10 +2909,6 @@ retry_cpuset:
2786 goto out; 2909 goto out;
2787 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2910 classzone_idx = zonelist_zone_idx(preferred_zoneref);
2788 2911
2789#ifdef CONFIG_CMA
2790 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2791 alloc_flags |= ALLOC_CMA;
2792#endif
2793 /* First allocation attempt */ 2912 /* First allocation attempt */
2794 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2913 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2795 zonelist, high_zoneidx, alloc_flags, 2914 zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3698,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3579 zonelist->_zonerefs[pos].zone_idx = 0; 3698 zonelist->_zonerefs[pos].zone_idx = 0;
3580} 3699}
3581 3700
3701#if defined(CONFIG_64BIT)
3702/*
3703 * Devices that require DMA32/DMA are relatively rare and do not justify a
3704 * penalty to every machine in case the specialised case applies. Default
3705 * to Node-ordering on 64-bit NUMA machines
3706 */
3707static int default_zonelist_order(void)
3708{
3709 return ZONELIST_ORDER_NODE;
3710}
3711#else
3712/*
3713 * On 32-bit, the Normal zone needs to be preserved for allocations accessible
3714 * by the kernel. If processes running on node 0 deplete the low memory zone
3715 * then reclaim will occur more frequency increasing stalls and potentially
3716 * be easier to OOM if a large percentage of the zone is under writeback or
3717 * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
3718 * Hence, default to zone ordering on 32-bit.
3719 */
3582static int default_zonelist_order(void) 3720static int default_zonelist_order(void)
3583{ 3721{
3584 int nid, zone_type;
3585 unsigned long low_kmem_size, total_size;
3586 struct zone *z;
3587 int average_size;
3588 /*
3589 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3590 * If they are really small and used heavily, the system can fall
3591 * into OOM very easily.
3592 * This function detect ZONE_DMA/DMA32 size and configures zone order.
3593 */
3594 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
3595 low_kmem_size = 0;
3596 total_size = 0;
3597 for_each_online_node(nid) {
3598 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3599 z = &NODE_DATA(nid)->node_zones[zone_type];
3600 if (populated_zone(z)) {
3601 if (zone_type < ZONE_NORMAL)
3602 low_kmem_size += z->managed_pages;
3603 total_size += z->managed_pages;
3604 } else if (zone_type == ZONE_NORMAL) {
3605 /*
3606 * If any node has only lowmem, then node order
3607 * is preferred to allow kernel allocations
3608 * locally; otherwise, they can easily infringe
3609 * on other nodes when there is an abundance of
3610 * lowmem available to allocate from.
3611 */
3612 return ZONELIST_ORDER_NODE;
3613 }
3614 }
3615 }
3616 if (!low_kmem_size || /* there are no DMA area. */
3617 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
3618 return ZONELIST_ORDER_NODE;
3619 /*
3620 * look into each node's config.
3621 * If there is a node whose DMA/DMA32 memory is very big area on
3622 * local memory, NODE_ORDER may be suitable.
3623 */
3624 average_size = total_size /
3625 (nodes_weight(node_states[N_MEMORY]) + 1);
3626 for_each_online_node(nid) {
3627 low_kmem_size = 0;
3628 total_size = 0;
3629 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
3630 z = &NODE_DATA(nid)->node_zones[zone_type];
3631 if (populated_zone(z)) {
3632 if (zone_type < ZONE_NORMAL)
3633 low_kmem_size += z->present_pages;
3634 total_size += z->present_pages;
3635 }
3636 }
3637 if (low_kmem_size &&
3638 total_size > average_size && /* ignore small node */
3639 low_kmem_size > total_size * 70/100)
3640 return ZONELIST_ORDER_NODE;
3641 }
3642 return ZONELIST_ORDER_ZONE; 3722 return ZONELIST_ORDER_ZONE;
3643} 3723}
3724#endif /* CONFIG_64BIT */
3644 3725
3645static void set_zonelist_order(void) 3726static void set_zonelist_order(void)
3646{ 3727{
@@ -3899,14 +3980,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3899 else 3980 else
3900 page_group_by_mobility_disabled = 0; 3981 page_group_by_mobility_disabled = 0;
3901 3982
3902 printk("Built %i zonelists in %s order, mobility grouping %s. " 3983 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
3903 "Total pages: %ld\n", 3984 "Total pages: %ld\n",
3904 nr_online_nodes, 3985 nr_online_nodes,
3905 zonelist_order_name[current_zonelist_order], 3986 zonelist_order_name[current_zonelist_order],
3906 page_group_by_mobility_disabled ? "off" : "on", 3987 page_group_by_mobility_disabled ? "off" : "on",
3907 vm_total_pages); 3988 vm_total_pages);
3908#ifdef CONFIG_NUMA 3989#ifdef CONFIG_NUMA
3909 printk("Policy zone: %s\n", zone_names[policy_zone]); 3990 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
3910#endif 3991#endif
3911} 3992}
3912 3993
@@ -4838,7 +4919,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4838#endif 4919#endif
4839 init_waitqueue_head(&pgdat->kswapd_wait); 4920 init_waitqueue_head(&pgdat->kswapd_wait);
4840 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4921 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4841 pgdat_page_cgroup_init(pgdat); 4922 pgdat_page_ext_init(pgdat);
4842 4923
4843 for (j = 0; j < MAX_NR_ZONES; j++) { 4924 for (j = 0; j < MAX_NR_ZONES; j++) {
4844 struct zone *zone = pgdat->node_zones + j; 4925 struct zone *zone = pgdat->node_zones + j;
@@ -4857,16 +4938,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4857 * and per-cpu initialisations 4938 * and per-cpu initialisations
4858 */ 4939 */
4859 memmap_pages = calc_memmap_size(size, realsize); 4940 memmap_pages = calc_memmap_size(size, realsize);
4860 if (freesize >= memmap_pages) { 4941 if (!is_highmem_idx(j)) {
4861 freesize -= memmap_pages; 4942 if (freesize >= memmap_pages) {
4862 if (memmap_pages) 4943 freesize -= memmap_pages;
4863 printk(KERN_DEBUG 4944 if (memmap_pages)
4864 " %s zone: %lu pages used for memmap\n", 4945 printk(KERN_DEBUG
4865 zone_names[j], memmap_pages); 4946 " %s zone: %lu pages used for memmap\n",
4866 } else 4947 zone_names[j], memmap_pages);
4867 printk(KERN_WARNING 4948 } else
4868 " %s zone: %lu pages exceeds freesize %lu\n", 4949 printk(KERN_WARNING
4869 zone_names[j], memmap_pages, freesize); 4950 " %s zone: %lu pages exceeds freesize %lu\n",
4951 zone_names[j], memmap_pages, freesize);
4952 }
4870 4953
4871 /* Account for reserved pages */ 4954 /* Account for reserved pages */
4872 if (j == 0 && freesize > dma_reserve) { 4955 if (j == 0 && freesize > dma_reserve) {
@@ -4976,6 +5059,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4976 pgdat->node_start_pfn = node_start_pfn; 5059 pgdat->node_start_pfn = node_start_pfn;
4977#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5060#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4978 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5061 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5062 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
5063 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
4979#endif 5064#endif
4980 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5065 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4981 zones_size, zholes_size); 5066 zones_size, zholes_size);
@@ -5338,33 +5423,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5338 find_zone_movable_pfns_for_nodes(); 5423 find_zone_movable_pfns_for_nodes();
5339 5424
5340 /* Print out the zone ranges */ 5425 /* Print out the zone ranges */
5341 printk("Zone ranges:\n"); 5426 pr_info("Zone ranges:\n");
5342 for (i = 0; i < MAX_NR_ZONES; i++) { 5427 for (i = 0; i < MAX_NR_ZONES; i++) {
5343 if (i == ZONE_MOVABLE) 5428 if (i == ZONE_MOVABLE)
5344 continue; 5429 continue;
5345 printk(KERN_CONT " %-8s ", zone_names[i]); 5430 pr_info(" %-8s ", zone_names[i]);
5346 if (arch_zone_lowest_possible_pfn[i] == 5431 if (arch_zone_lowest_possible_pfn[i] ==
5347 arch_zone_highest_possible_pfn[i]) 5432 arch_zone_highest_possible_pfn[i])
5348 printk(KERN_CONT "empty\n"); 5433 pr_cont("empty\n");
5349 else 5434 else
5350 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5435 pr_cont("[mem %0#10lx-%0#10lx]\n",
5351 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5436 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5352 (arch_zone_highest_possible_pfn[i] 5437 (arch_zone_highest_possible_pfn[i]
5353 << PAGE_SHIFT) - 1); 5438 << PAGE_SHIFT) - 1);
5354 } 5439 }
5355 5440
5356 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5441 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5357 printk("Movable zone start for each node\n"); 5442 pr_info("Movable zone start for each node\n");
5358 for (i = 0; i < MAX_NUMNODES; i++) { 5443 for (i = 0; i < MAX_NUMNODES; i++) {
5359 if (zone_movable_pfn[i]) 5444 if (zone_movable_pfn[i])
5360 printk(" Node %d: %#010lx\n", i, 5445 pr_info(" Node %d: %#010lx\n", i,
5361 zone_movable_pfn[i] << PAGE_SHIFT); 5446 zone_movable_pfn[i] << PAGE_SHIFT);
5362 } 5447 }
5363 5448
5364 /* Print out the early node map */ 5449 /* Print out the early node map */
5365 printk("Early memory node ranges\n"); 5450 pr_info("Early memory node ranges\n");
5366 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5451 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5367 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5452 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5368 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5453 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5369 5454
5370 /* Initialise every node */ 5455 /* Initialise every node */
@@ -5500,9 +5585,9 @@ void __init mem_init_print_info(const char *str)
5500 5585
5501#undef adj_init_size 5586#undef adj_init_size
5502 5587
5503 printk("Memory: %luK/%luK available " 5588 pr_info("Memory: %luK/%luK available "
5504 "(%luK kernel code, %luK rwdata, %luK rodata, " 5589 "(%luK kernel code, %luK rwdata, %luK rodata, "
5505 "%luK init, %luK bss, %luK reserved" 5590 "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
5506#ifdef CONFIG_HIGHMEM 5591#ifdef CONFIG_HIGHMEM
5507 ", %luK highmem" 5592 ", %luK highmem"
5508#endif 5593#endif
@@ -5510,7 +5595,8 @@ void __init mem_init_print_info(const char *str)
5510 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), 5595 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5511 codesize >> 10, datasize >> 10, rosize >> 10, 5596 codesize >> 10, datasize >> 10, rosize >> 10,
5512 (init_data_size + init_code_size) >> 10, bss_size >> 10, 5597 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5513 (physpages - totalram_pages) << (PAGE_SHIFT-10), 5598 (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
5599 totalcma_pages << (PAGE_SHIFT-10),
5514#ifdef CONFIG_HIGHMEM 5600#ifdef CONFIG_HIGHMEM
5515 totalhigh_pages << (PAGE_SHIFT-10), 5601 totalhigh_pages << (PAGE_SHIFT-10),
5516#endif 5602#endif
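
Note: the banner change above reports CMA memory separately, so "reserved" becomes (physpages - totalram_pages - totalcma_pages) and every page count is converted to KiB by shifting with (PAGE_SHIFT - 10). A quick standalone check of that arithmetic with made-up page counts:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, as on x86 */

int main(void)
{
        unsigned long physpages = 262144;  /* example: 1 GiB of RAM            */
        unsigned long totalram  = 250000;  /* example: pages handed to the buddy */
        unsigned long totalcma  = 4096;    /* example: 16 MiB CMA area          */

        unsigned long reserved_kib = (physpages - totalram - totalcma) << (PAGE_SHIFT - 10);
        unsigned long cma_kib      = totalcma << (PAGE_SHIFT - 10);

        printf("reserved:     %luK\n", reserved_kib);  /* (262144-250000-4096)*4 = 32192K */
        printf("cma-reserved: %luK\n", cma_kib);       /* 4096*4 = 16384K */
        return 0;
}
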
@@ -6202,9 +6288,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
6202 if (!PageLRU(page)) 6288 if (!PageLRU(page))
6203 found++; 6289 found++;
6204 /* 6290 /*
6205 * If there are RECLAIMABLE pages, we need to check it. 6291 * If there are RECLAIMABLE pages, we need to check
6206 * But now, memory offline itself doesn't call shrink_slab() 6292 * it. But now, memory offline itself doesn't call
6207 * and it still to be fixed. 6293 * shrink_node_slabs() and it still to be fixed.
6208 */ 6294 */
6209 /* 6295 /*
6210 * If the page is not RAM, page_count()should be 0. 6296 * If the page is not RAM, page_count()should be 0.
@@ -6277,8 +6363,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
6277 6363
6278 if (list_empty(&cc->migratepages)) { 6364 if (list_empty(&cc->migratepages)) {
6279 cc->nr_migratepages = 0; 6365 cc->nr_migratepages = 0;
6280 pfn = isolate_migratepages_range(cc->zone, cc, 6366 pfn = isolate_migratepages_range(cc, pfn, end);
6281 pfn, end, true);
6282 if (!pfn) { 6367 if (!pfn) {
6283 ret = -EINTR; 6368 ret = -EINTR;
6284 break; 6369 break;
@@ -6390,7 +6475,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6390 */ 6475 */
6391 6476
6392 lru_add_drain_all(); 6477 lru_add_drain_all();
6393 drain_all_pages(); 6478 drain_all_pages(cc.zone);
6394 6479
6395 order = 0; 6480 order = 0;
6396 outer_start = start; 6481 outer_start = start;
@@ -6404,13 +6489,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6404 6489
6405 /* Make sure the range is really isolated. */ 6490 /* Make sure the range is really isolated. */
6406 if (test_pages_isolated(outer_start, end, false)) { 6491 if (test_pages_isolated(outer_start, end, false)) {
6407 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 6492 pr_info("%s: [%lx, %lx) PFNs busy\n",
6408 outer_start, end); 6493 __func__, outer_start, end);
6409 ret = -EBUSY; 6494 ret = -EBUSY;
6410 goto done; 6495 goto done;
6411 } 6496 }
6412 6497
6413
6414 /* Grab isolated pages from freelists. */ 6498 /* Grab isolated pages from freelists. */
6415 outer_end = isolate_freepages_range(&cc, outer_start, end); 6499 outer_end = isolate_freepages_range(&cc, outer_start, end);
6416 if (!outer_end) { 6500 if (!outer_end) {
@@ -6554,97 +6638,3 @@ bool is_free_buddy_page(struct page *page)
6554 return order < MAX_ORDER; 6638 return order < MAX_ORDER;
6555} 6639}
6556#endif 6640#endif
6557
6558static const struct trace_print_flags pageflag_names[] = {
6559 {1UL << PG_locked, "locked" },
6560 {1UL << PG_error, "error" },
6561 {1UL << PG_referenced, "referenced" },
6562 {1UL << PG_uptodate, "uptodate" },
6563 {1UL << PG_dirty, "dirty" },
6564 {1UL << PG_lru, "lru" },
6565 {1UL << PG_active, "active" },
6566 {1UL << PG_slab, "slab" },
6567 {1UL << PG_owner_priv_1, "owner_priv_1" },
6568 {1UL << PG_arch_1, "arch_1" },
6569 {1UL << PG_reserved, "reserved" },
6570 {1UL << PG_private, "private" },
6571 {1UL << PG_private_2, "private_2" },
6572 {1UL << PG_writeback, "writeback" },
6573#ifdef CONFIG_PAGEFLAGS_EXTENDED
6574 {1UL << PG_head, "head" },
6575 {1UL << PG_tail, "tail" },
6576#else
6577 {1UL << PG_compound, "compound" },
6578#endif
6579 {1UL << PG_swapcache, "swapcache" },
6580 {1UL << PG_mappedtodisk, "mappedtodisk" },
6581 {1UL << PG_reclaim, "reclaim" },
6582 {1UL << PG_swapbacked, "swapbacked" },
6583 {1UL << PG_unevictable, "unevictable" },
6584#ifdef CONFIG_MMU
6585 {1UL << PG_mlocked, "mlocked" },
6586#endif
6587#ifdef CONFIG_ARCH_USES_PG_UNCACHED
6588 {1UL << PG_uncached, "uncached" },
6589#endif
6590#ifdef CONFIG_MEMORY_FAILURE
6591 {1UL << PG_hwpoison, "hwpoison" },
6592#endif
6593#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6594 {1UL << PG_compound_lock, "compound_lock" },
6595#endif
6596};
6597
6598static void dump_page_flags(unsigned long flags)
6599{
6600 const char *delim = "";
6601 unsigned long mask;
6602 int i;
6603
6604 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6605
6606 printk(KERN_ALERT "page flags: %#lx(", flags);
6607
6608 /* remove zone id */
6609 flags &= (1UL << NR_PAGEFLAGS) - 1;
6610
6611 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
6612
6613 mask = pageflag_names[i].mask;
6614 if ((flags & mask) != mask)
6615 continue;
6616
6617 flags &= ~mask;
6618 printk("%s%s", delim, pageflag_names[i].name);
6619 delim = "|";
6620 }
6621
6622 /* check for left over flags */
6623 if (flags)
6624 printk("%s%#lx", delim, flags);
6625
6626 printk(")\n");
6627}
6628
6629void dump_page_badflags(struct page *page, const char *reason,
6630 unsigned long badflags)
6631{
6632 printk(KERN_ALERT
6633 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
6634 page, atomic_read(&page->_count), page_mapcount(page),
6635 page->mapping, page->index);
6636 dump_page_flags(page->flags);
6637 if (reason)
6638 pr_alert("page dumped because: %s\n", reason);
6639 if (page->flags & badflags) {
6640 pr_alert("bad because of flags:\n");
6641 dump_page_flags(page->flags & badflags);
6642 }
6643 mem_cgroup_print_bad_page(page);
6644}
6645
6646void dump_page(struct page *page, const char *reason)
6647{
6648 dump_page_badflags(page, reason, 0);
6649}
6650EXPORT_SYMBOL(dump_page);
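
Note: the pageflag_names table and the dump_page()/dump_page_badflags()/dump_page_flags() helpers removed at the end of this diff implement a straightforward "decode a bitmask through a name table" loop; the diff only drops them from page_alloc.c. A self-contained userspace version of the same decoding loop, with generic flag names rather than the kernel's page flags:

#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

static const struct flag_name names[] = {
        { 1UL << 0, "locked"   },
        { 1UL << 1, "dirty"    },
        { 1UL << 2, "uptodate" },
        { 1UL << 3, "lru"      },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        size_t i;

        printf("flags: %#lx(", flags);
        for (i = 0; i < sizeof(names) / sizeof(names[0]) && flags; i++) {
                unsigned long mask = names[i].mask;

                if ((flags & mask) != mask)
                        continue;
                flags &= ~mask;
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                       /* leftover bits with no name */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 3) | (1UL << 7));
        return 0;
}
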