path: root/mm/page_alloc.c
author		Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
committer	Paul Mundt <lethal@linux-sh.org>	2011-01-13 01:06:28 -0500
commit		f43dc23d5ea91fca257be02138a255f02d98e806 (patch)
tree		b29722f6e965316e90ac97abf79923ced250dc21 /mm/page_alloc.c
parent		f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent		4162cf64973df51fc885825bc9ca4d055891c49f (diff)

Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework

Conflicts:
	arch/sh/kernel/cpu/sh2/setup-sh7619.c
	arch/sh/kernel/cpu/sh2a/setup-mxg.c
	arch/sh/kernel/cpu/sh2a/setup-sh7201.c
	arch/sh/kernel/cpu/sh2a/setup-sh7203.c
	arch/sh/kernel/cpu/sh2a/setup-sh7206.c
	arch/sh/kernel/cpu/sh3/setup-sh7705.c
	arch/sh/kernel/cpu/sh3/setup-sh770x.c
	arch/sh/kernel/cpu/sh3/setup-sh7710.c
	arch/sh/kernel/cpu/sh3/setup-sh7720.c
	arch/sh/kernel/cpu/sh4/setup-sh4-202.c
	arch/sh/kernel/cpu/sh4/setup-sh7750.c
	arch/sh/kernel/cpu/sh4/setup-sh7760.c
	arch/sh/kernel/cpu/sh4a/setup-sh7343.c
	arch/sh/kernel/cpu/sh4a/setup-sh7366.c
	arch/sh/kernel/cpu/sh4a/setup-sh7722.c
	arch/sh/kernel/cpu/sh4a/setup-sh7723.c
	arch/sh/kernel/cpu/sh4a/setup-sh7724.c
	arch/sh/kernel/cpu/sh4a/setup-sh7763.c
	arch/sh/kernel/cpu/sh4a/setup-sh7770.c
	arch/sh/kernel/cpu/sh4a/setup-sh7780.c
	arch/sh/kernel/cpu/sh4a/setup-sh7785.c
	arch/sh/kernel/cpu/sh4a/setup-sh7786.c
	arch/sh/kernel/cpu/sh4a/setup-shx3.c
	arch/sh/kernel/cpu/sh5/setup-sh5.c
	drivers/serial/sh-sci.c
	drivers/serial/sh-sci.h
	include/linux/serial_sci.h
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	1307
1 file changed, 966 insertions, 341 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 30d5093a099d..ff7e15872398 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
@@ -48,11 +49,31 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/compaction.h>
+#include <trace/events/kmem.h>
+#include <linux/ftrace_event.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
 /*
  * Array of node states.
  */
@@ -71,10 +92,39 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
+}
+
+void pm_restrict_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
+}
+#endif /* CONFIG_PM_SLEEP */
+
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 int pageblock_order __read_mostly;
 #endif
@@ -123,8 +173,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
 
 int min_free_kbytes = 1024;
 
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -234,6 +284,12 @@ static void bad_page(struct page *page)
 	static unsigned long nr_shown;
 	static unsigned long nr_unshown;
 
+	/* Don't complain about poisoned pages */
+	if (PageHWPoison(page)) {
+		__ClearPageBuddy(page);
+		return;
+	}
+
 	/*
 	 * Allow a burst of 60 reports, then keep quiet for that minute;
 	 * or allow a steady drip of one report per second.
@@ -256,10 +312,7 @@ static void bad_page(struct page *page)
 
 	printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
 		current->comm, page_to_pfn(page));
-	printk(KERN_ALERT
-		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
-		page, (void *)page->flags, page_count(page),
-		page_mapcount(page), page->mapping, page->index);
+	dump_page(page);
 
 	dump_stack();
 out:
@@ -445,6 +498,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -458,9 +513,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -475,12 +527,32 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
  * free_page_mlock() -- clean up attempts to free and mlocked() page.
  * Page should not be on lru, so no need to fix that up.
@@ -491,9 +563,6 @@ static inline void free_page_mlock(struct page *page)
 	__dec_zone_page_state(page, NR_MLOCK);
 	__count_vm_event(UNEVICTABLE_MLOCKFREED);
 }
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
 
 static inline int free_pages_check(struct page *page)
 {
@@ -510,7 +579,7 @@ static inline int free_pages_check(struct page *page)
 }
 
 /*
- * Frees a list of pages.
+ * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -520,23 +589,45 @@ static inline int free_pages_check(struct page *page)
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
-static void free_pages_bulk(struct zone *zone, int count,
-					struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+					struct per_cpu_pages *pcp)
 {
+	int migratetype = 0;
+	int batch_free = 0;
+	int to_free = count;
+
 	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
-	while (count--) {
+	while (to_free) {
 		struct page *page;
+		struct list_head *list;
 
-		VM_BUG_ON(list_empty(list));
-		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_one_page list manipulates */
-		list_del(&page->lru);
-		__free_one_page(page, zone, order, page_private(page));
+		/*
+		 * Remove pages from lists in a round-robin fashion. A
+		 * batch_free count is maintained that is incremented when an
+		 * empty list is encountered. This is so more pages are freed
+		 * off fuller lists instead of spinning excessively around empty
+		 * lists
+		 */
+		do {
+			batch_free++;
+			if (++migratetype == MIGRATE_PCPTYPES)
+				migratetype = 0;
+			list = &pcp->lists[migratetype];
+		} while (list_empty(list));
+
+		do {
+			page = list_entry(list->prev, struct page, lru);
+			/* must delete as __free_one_page list manipulates */
+			list_del(&page->lru);
+			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+			__free_one_page(page, zone, 0, page_private(page));
+			trace_mm_page_pcpu_drain(page, 0, page_private(page));
+		} while (--to_free && --batch_free && !list_empty(list));
 	}
+	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
@@ -544,27 +635,31 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 				int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	__free_one_page(page, zone, order, migratetype);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	spin_unlock(&zone->lock);
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = TestClearPageMlocked(page);
 
+	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0 ; i < (1 << order) ; ++i)
-		bad += free_pages_check(page + i);
+	for (i = 0; i < (1 << order); i++) {
+		struct page *pg = page + i;
+
+		if (PageAnon(pg))
+			pg->mapping = NULL;
+		bad += free_pages_check(pg);
+	}
 	if (bad)
-		return;
+		return false;
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -574,6 +669,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int wasMlocked = __TestClearPageMlocked(page);
+
+	if (!free_pages_prepare(page, order))
+		return;
+
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -646,7 +752,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL) |
@@ -655,6 +761,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 		bad_page(page);
 		return 1;
 	}
+	return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct page *p = page + i;
+		if (unlikely(check_new_page(p)))
+			return 1;
+	}
 
 	set_page_private(page, 0);
 	set_page_refcounted(page);
@@ -783,6 +901,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
 	return move_freepages(zone, start_page, end_page, migratetype);
 }
 
+static void change_pageblock_range(struct page *pageblock_page,
+					int start_order, int migratetype)
+{
+	int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+	while (nr_pageblocks--) {
+		set_pageblock_migratetype(pageblock_page, migratetype);
+		pageblock_page += pageblock_nr_pages;
+	}
+}
+
 /* Remove an element from the buddy allocator from the fallback list */
 static inline struct page *
 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
@@ -817,13 +946,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			 * agressive about taking ownership of free pages
 			 */
 			if (unlikely(current_order >= (pageblock_order >> 1)) ||
-					start_migratetype == MIGRATE_RECLAIMABLE) {
+					start_migratetype == MIGRATE_RECLAIMABLE ||
+					page_group_by_mobility_disabled) {
 				unsigned long pages;
 				pages = move_freepages_block(zone, page,
 								start_migratetype);
 
 				/* Claim the whole block if over half of it is free */
-				if (pages >= (1 << (pageblock_order-1)))
+				if (pages >= (1 << (pageblock_order-1)) ||
+						page_group_by_mobility_disabled)
 					set_pageblock_migratetype(page,
 								start_migratetype);
 
@@ -834,11 +965,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			list_del(&page->lru);
 			rmv_page_order(page);
 
-			if (current_order == pageblock_order)
-				set_pageblock_migratetype(page,
+			/* Take ownership for orders >= pageblock_order */
+			if (current_order >= pageblock_order)
+				change_pageblock_range(page, current_order,
 							start_migratetype);
 
 			expand(zone, page, order, current_order, area, migratetype);
+
+			trace_mm_page_alloc_extfrag(page, order, current_order,
+				start_migratetype, migratetype);
+
 			return page;
 		}
 	}
@@ -872,6 +1008,7 @@ retry_reserve:
 		}
 	}
 
+	trace_mm_page_alloc_zone_locked(page, order, migratetype);
 	return page;
 }
 
@@ -882,7 +1019,7 @@ retry_reserve:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, int cold)
 {
 	int i;
 
@@ -901,7 +1038,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		list_add(&page->lru, list);
+		if (likely(cold == 0))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -929,7 +1069,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	free_pcppages_bulk(zone, to_drain, pcp);
 	pcp->count -= to_drain;
 	local_irq_restore(flags);
 }
@@ -951,11 +1091,11 @@ static void drain_pages(unsigned int cpu)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		local_irq_save(flags);
+		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
 		local_irq_restore(flags);
 	}
@@ -1015,56 +1155,54 @@ void mark_free_pages(struct zone *zone)
 
 /*
  * Free a 0-order page
+ * cold == 1 ? free a cold page : free a hot page
  */
-static void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
-	int wasMlocked = TestClearPageMlocked(page);
-
-	kmemcheck_free_shadow(page, 0);
+	int migratetype;
+	int wasMlocked = __TestClearPageMlocked(page);
 
-	if (PageAnon(page))
-		page->mapping = NULL;
-	if (free_pages_check(page))
+	if (!free_pages_prepare(page, 0))
 		return;
 
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-	}
-	arch_free_page(page, 0);
-	kernel_map_pages(page, 1, 0);
-
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
-	set_page_private(page, get_pageblock_migratetype(page));
+	migratetype = get_pageblock_migratetype(page);
+	set_page_private(page, migratetype);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
 	__count_vm_event(PGFREE);
 
+	/*
+	 * We only track unmovable, reclaimable and movable on pcp lists.
+	 * Free ISOLATE pages back to the allocator because they are being
+	 * offlined but treat RESERVE as movable pages so we can get those
+	 * areas back if necessary. Otherwise, we may have to free
+	 * excessively into the page allocator
+	 */
+	if (migratetype >= MIGRATE_PCPTYPES) {
+		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+			free_one_page(zone, page, 0, migratetype);
+			goto out;
+		}
+		migratetype = MIGRATE_MOVABLE;
+	}
+
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (cold)
-		list_add_tail(&page->lru, &pcp->list);
+		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
-		list_add(&page->lru, &pcp->list);
+		list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->batch, pcp);
 		pcp->count -= pcp->batch;
 	}
-	local_irq_restore(flags);
-	put_cpu();
-}
 
-void free_hot_page(struct page *page)
-{
-	free_hot_cold_page(page, 0);
-}
-
-void free_cold_page(struct page *page)
-{
-	free_hot_cold_page(page, 1);
+out:
+	local_irq_restore(flags);
 }
 
 /*
@@ -1096,6 +1234,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
@@ -1108,39 +1291,27 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 
 again:
-	cpu = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
+		struct list_head *list;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
 		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			if (unlikely(!pcp->count))
+		pcp = &this_cpu_ptr(zone->pageset)->pcp;
+		list = &pcp->lists[migratetype];
+		if (list_empty(list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, list,
+					migratetype, cold);
+			if (unlikely(list_empty(list)))
 				goto failed;
 		}
 
-		/* Find a page of the appropriate migrate type */
-		if (cold) {
-			list_for_each_entry_reverse(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		} else {
-			list_for_each_entry(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		}
-
-		/* Allocate more to the pcp list if necessary */
-		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			page = list_entry(pcp->list.next, struct page, lru);
-		}
+		if (cold)
+			page = list_entry(list->prev, struct page, lru);
+		else
+			page = list_entry(list->next, struct page, lru);
 
 		list_del(&page->lru);
 		pcp->count--;
@@ -1153,23 +1324,22 @@ again:
 			 * properly detect and handle allocation failures.
 			 *
 			 * We most definitely don't want callers attempting to
-			 * allocate greater than single-page units with
+			 * allocate greater than order-1 page units with
 			 * __GFP_NOFAIL.
 			 */
-			WARN_ON_ONCE(order > 0);
+			WARN_ON_ONCE(order > 1);
 		}
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order, migratetype);
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
+		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1178,7 +1348,6 @@ again:
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
@@ -1299,7 +1468,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1576,7 +1745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
@@ -1593,18 +1762,87 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/* The OOM killer does not needlessly kill tasks for lowmem */
+		if (high_zoneidx < ZONE_NORMAL)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
 	return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page;
+
+	if (!order || compaction_deferred(preferred_zone))
+		return NULL;
+
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+								nodemask);
+	if (*did_some_progress != COMPACT_SKIPPED) {
+
+		/* Page migration frees to the PCP lists but we want merging */
+		drain_pages(get_cpu());
+		put_cpu();
+
+		page = get_page_from_freelist(gfp_mask, nodemask,
+				order, zonelist, high_zoneidx,
+				alloc_flags, preferred_zone,
+				migratetype);
+		if (page) {
+			preferred_zone->compact_considered = 0;
+			preferred_zone->compact_defer_shift = 0;
+			count_vm_event(COMPACTSUCCESS);
+			return page;
+		}
+
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+	}
+
+	return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1615,15 +1853,12 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
+	bool drained = false;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-
-	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
-	 */
 	p->flags |= PF_MEMALLOC;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
@@ -1637,14 +1872,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	cond_resched();
 
-	if (order != 0)
-		drain_all_pages();
+	if (unlikely(!(*did_some_progress)))
+		return NULL;
 
-	if (likely(*did_some_progress))
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx,
-					alloc_flags, preferred_zone,
-					migratetype);
+retry:
+	page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx,
+					alloc_flags, preferred_zone,
+					migratetype);
+
+	/*
+	 * If an allocation failed after direct reclaim, it could be because
+	 * pages are pinned on the per-cpu lists. Drain them and try again
+	 */
+	if (!page && !drained) {
+		drain_all_pages();
+		drained = true;
+		goto retry;
+	}
+
 	return page;
 }
 
@@ -1666,7 +1912,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			preferred_zone, migratetype);
 
 		if (!page && gfp_mask & __GFP_NOFAIL)
-			congestion_wait(WRITE, HZ/50);
+			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
 	return page;
@@ -1691,7 +1937,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 
 	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
-	BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
+	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
 
 	/*
 	 * The caller may dip into page reserves a bit more if the caller
@@ -1699,7 +1945,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
 	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
 	 */
-	alloc_flags |= (gfp_mask & __GFP_HIGH);
+	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
 	if (!wait) {
 		alloc_flags |= ALLOC_HARDER;
@@ -1708,7 +1954,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)))
+	} else if (unlikely(rt_task(p)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1740,8 +1986,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * be using allocators in order of preference for an area that is
 	 * too large.
 	 */
-	if (WARN_ON_ONCE(order >= MAX_ORDER))
+	if (order >= MAX_ORDER) {
+		WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
 		return NULL;
+	}
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1754,6 +2002,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
+restart:
 	wake_all_kswapd(order, zonelist, high_zoneidx);
 
 	/*
@@ -1763,7 +2012,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-restart:
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1789,6 +2037,19 @@ rebalance:
 	if (p->flags & PF_MEMALLOC)
 		goto nopage;
 
+	/* Avoid allocations with no watermarks from looping endlessly */
+	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+		goto nopage;
+
+	/* Try direct compaction */
+	page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+	if (page)
+		goto got_pg;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -1813,15 +2074,23 @@ rebalance:
 		if (page)
 			goto got_pg;
 
-		/*
-		 * The OOM killer does not trigger for high-order
-		 * ~__GFP_NOFAIL allocations so if no progress is being
-		 * made, there are no other options and retrying is
-		 * unlikely to help.
-		 */
-		if (order > PAGE_ALLOC_COSTLY_ORDER &&
-						!(gfp_mask & __GFP_NOFAIL))
+		if (!(gfp_mask & __GFP_NOFAIL)) {
+			/*
+			 * The oom killer is not called for high-order
+			 * allocations that may fail, so if no progress
+			 * is being made, there are no other options and
+			 * retrying is unlikely to help.
+			 */
+			if (order > PAGE_ALLOC_COSTLY_ORDER)
 				goto nopage;
+			/*
+			 * The oom killer is not called for lowmem
+			 * allocations to prevent needlessly killing
+			 * innocent tasks.
+			 */
+			if (high_zoneidx < ZONE_NORMAL)
+				goto nopage;
+		}
 
 		goto restart;
 	}
@@ -1831,7 +2100,7 @@ rebalance:
 	pages_reclaimed += did_some_progress;
 	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
 		/* Wait for some write requests to complete then retry */
-		congestion_wait(WRITE, HZ/50);
+		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
 	}
 
@@ -1880,10 +2149,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
-	if (!preferred_zone)
+	if (!preferred_zone) {
+		put_mems_allowed();
 		return NULL;
+	}
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1893,7 +2165,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	put_mems_allowed();
 
+	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -1903,46 +2177,42 @@ EXPORT_SYMBOL(__alloc_pages_nodemask);
  */
 unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
-	struct page * page;
+	struct page *page;
+
+	/*
+	 * __get_free_pages() returns a 32-bit address, which cannot represent
+	 * a highmem page
+	 */
+	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
 	page = alloc_pages(gfp_mask, order);
 	if (!page)
 		return 0;
 	return (unsigned long) page_address(page);
 }
-
 EXPORT_SYMBOL(__get_free_pages);
 
 unsigned long get_zeroed_page(gfp_t gfp_mask)
 {
-	struct page * page;
-
-	/*
-	 * get_zeroed_page() returns a 32-bit address, which cannot represent
-	 * a highmem page
-	 */
-	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
-	page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
-	if (page)
-		return (unsigned long) page_address(page);
-	return 0;
+	return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
 }
-
 EXPORT_SYMBOL(get_zeroed_page);
 
 void __pagevec_free(struct pagevec *pvec)
 {
 	int i = pagevec_count(pvec);
 
-	while (--i >= 0)
+	while (--i >= 0) {
+		trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
+	}
 }
 
 void __free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page)) {
 		if (order == 0)
-			free_hot_page(page);
+			free_hot_cold_page(page, 0);
 		else
 			__free_pages_ok(page, order);
 	}
@@ -1983,7 +2253,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
 		unsigned long alloc_end = addr + (PAGE_SIZE << order);
 		unsigned long used = addr + PAGE_ALIGN(size);
 
-		split_page(virt_to_page(addr), order);
+		split_page(virt_to_page((void *)addr), order);
 		while (used < alloc_end) {
 			free_page(used);
 			used += PAGE_SIZE;
@@ -2107,7 +2377,7 @@ void show_free_areas(void)
 		for_each_online_cpu(cpu) {
 			struct per_cpu_pageset *pageset;
 
-			pageset = zone_pcp(zone, cpu);
+			pageset = per_cpu_ptr(zone->pageset, cpu);
 
 			printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
 			       cpu, pageset->pcp.high,
@@ -2115,23 +2385,27 @@ void show_free_areas(void)
 		}
 	}
 
-	printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
-		" inactive_file:%lu"
+	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
 		" unevictable:%lu"
 		" dirty:%lu writeback:%lu unstable:%lu\n"
-		" free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+		" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
 		global_page_state(NR_ACTIVE_ANON),
-		global_page_state(NR_ACTIVE_FILE),
 		global_page_state(NR_INACTIVE_ANON),
+		global_page_state(NR_ISOLATED_ANON),
+		global_page_state(NR_ACTIVE_FILE),
 		global_page_state(NR_INACTIVE_FILE),
+		global_page_state(NR_ISOLATED_FILE),
 		global_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
 		global_page_state(NR_UNSTABLE_NFS),
 		global_page_state(NR_FREE_PAGES),
-		global_page_state(NR_SLAB_RECLAIMABLE) +
+		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
 		global_page_state(NR_FILE_MAPPED),
+		global_page_state(NR_SHMEM),
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
@@ -2149,12 +2423,26 @@ void show_free_areas(void)
			" active_file:%lukB"
			" inactive_file:%lukB"
			" unevictable:%lukB"
+			" isolated(anon):%lukB"
+			" isolated(file):%lukB"
			" present:%lukB"
+			" mlocked:%lukB"
+			" dirty:%lukB"
+			" writeback:%lukB"
+			" mapped:%lukB"
+			" shmem:%lukB"
+			" slab_reclaimable:%lukB"
+			" slab_unreclaimable:%lukB"
+			" kernel_stack:%lukB"
+			" pagetables:%lukB"
+			" unstable:%lukB"
+			" bounce:%lukB"
+			" writeback_tmp:%lukB"
			" pages_scanned:%lu"
			" all_unreclaimable? %s"
			"\n",
			zone->name,
-			K(zone_page_state(zone, NR_FREE_PAGES)),
+			K(zone_nr_free_pages(zone)),
			K(min_wmark_pages(zone)),
			K(low_wmark_pages(zone)),
			K(high_wmark_pages(zone)),
@@ -2163,9 +2451,24 @@ void show_free_areas(void)
 			K(zone_page_state(zone, NR_ACTIVE_FILE)),
 			K(zone_page_state(zone, NR_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_UNEVICTABLE)),
+			K(zone_page_state(zone, NR_ISOLATED_ANON)),
+			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
+			K(zone_page_state(zone, NR_MLOCK)),
+			K(zone_page_state(zone, NR_FILE_DIRTY)),
+			K(zone_page_state(zone, NR_WRITEBACK)),
+			K(zone_page_state(zone, NR_FILE_MAPPED)),
+			K(zone_page_state(zone, NR_SHMEM)),
+			K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
+			K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+			zone_page_state(zone, NR_KERNEL_STACK) *
+					THREAD_SIZE / 1024,
+			K(zone_page_state(zone, NR_PAGETABLE)),
+			K(zone_page_state(zone, NR_UNSTABLE_NFS)),
+			K(zone_page_state(zone, NR_BOUNCE)),
+			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
-			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
+			(zone->all_unreclaimable ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -2292,18 +2595,19 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order);
  * sysctl handler for numa_zonelist_order
  */
 int numa_zonelist_order_handler(ctl_table *table, int write,
-		struct file *file, void __user *buffer, size_t *length,
+		void __user *buffer, size_t *length,
 		loff_t *ppos)
 {
 	char saved_string[NUMA_ZONELIST_ORDER_LEN];
 	int ret;
+	static DEFINE_MUTEX(zl_order_mutex);
 
+	mutex_lock(&zl_order_mutex);
 	if (write)
-		strncpy(saved_string, (char*)table->data,
-			NUMA_ZONELIST_ORDER_LEN);
-	ret = proc_dostring(table, write, file, buffer, length, ppos);
+		strcpy(saved_string, (char*)table->data);
+	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
-		return ret;
+		goto out;
 	if (write) {
 		int oldval = user_zonelist_order;
 		if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2313,10 +2617,15 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			strncpy((char*)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order)
-			build_all_zonelists();
+		} else if (oldval != user_zonelist_order) {
+			mutex_lock(&zonelists_mutex);
+			build_all_zonelists(NULL);
+			mutex_unlock(&zonelists_mutex);
+		}
 	}
-	return 0;
+out:
+	mutex_unlock(&zl_order_mutex);
+	return ret;
 }
 
 
@@ -2456,10 +2765,10 @@ static int default_zonelist_order(void)
 	struct zone *z;
 	int average_size;
 	/*
-	 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
 	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
 	 */
 	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
 	low_kmem_size = 0;
@@ -2471,6 +2780,15 @@ static int default_zonelist_order(void)
 				if (zone_type < ZONE_NORMAL)
 					low_kmem_size += z->present_pages;
 				total_size += z->present_pages;
+			} else if (zone_type == ZONE_NORMAL) {
+				/*
+				 * If any node has only lowmem, then node order
+				 * is preferred to allow kernel allocations
+				 * locally; otherwise, they can easily infringe
+				 * on other nodes when there is an abundance of
+				 * lowmem available to allocate from.
+				 */
+				return ZONELIST_ORDER_NODE;
 			}
 		}
 	}
@@ -2533,7 +2851,6 @@ static void build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 
-	memset(node_load, 0, sizeof(node_load));
 	memset(node_order, 0, sizeof(node_order));
 	j = 0;
 
@@ -2585,6 +2902,24 @@ static void build_zonelist_cache(pg_data_t *pgdat)
 		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
 }
 
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * Return node id of node used for "local" allocations.
+ * I.e., first node id of first zone in arg node's generic zonelist.
+ * Used for initializing percpu 'numa_mem', which is used primarily
+ * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
+ */
+int local_memory_node(int node)
+{
+	struct zone *zone;
+
+	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+				   gfp_zone(GFP_KERNEL),
+				   NULL,
+				   &zone);
+	return zone->node;
+}
+#endif
 
 #else	/* CONFIG_NUMA */
 
@@ -2637,21 +2972,85 @@ static void build_zonelist_cache(pg_data_t *pgdat)
 
 #endif	/* CONFIG_NUMA */
 
+/*
+ * Boot pageset table. One per cpu which is going to be used for all
+ * zones and all nodes. The parameters will be set in such a way
+ * that an item put on a list will immediately be handed over to
+ * the buddy list. This is safe since pageset manipulation is done
+ * with interrupts disabled.
+ *
+ * The boot_pagesets must be kept even after bootup is complete for
+ * unused processors and/or zones. They do play a role for bootstrapping
+ * hotplugged processors.
+ *
+ * zoneinfo_show() and maybe other functions do
+ * not check if the processor is online before following the pageset pointer.
+ * Other parts of the kernel may not check if the zone is available.
+ */
+static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
+
 /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
 	int nid;
+	int cpu;
 
+#ifdef CONFIG_NUMA
+	memset(node_load, 0, sizeof(node_load));
+#endif
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 
 		build_zonelists(pgdat);
 		build_zonelist_cache(pgdat);
 	}
+
+	/*
+	 * Initialize the boot_pagesets that are going to be used
+	 * for bootstrapping processors. The real pagesets for
+	 * each zone will be allocated later when the per cpu
+	 * allocator is available.
+	 *
+	 * boot_pagesets are used also for bootstrapping offline
+	 * cpus if the system is already booted because the pagesets
+	 * are needed to initialize allocators on a specific cpu too.
+	 * F.e. the percpu allocator needs the page allocator which
+	 * needs the percpu allocator in order to allocate its pagesets
+	 * (a chicken-egg dilemma).
+	 */
+	for_each_possible_cpu(cpu) {
+		setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+		/*
+		 * We now know the "local memory node" for each node--
+		 * i.e., the node of the first zone in the generic zonelist.
+		 * Set up numa_mem percpu variable for on-line cpus. During
+		 * boot, only the boot cpu should be on-line; we'll init the
+		 * secondary cpus' numa_mem as they come on-line. During
+		 * node/memory hotplug, we'll fixup all on-line cpus.
+		 */
+		if (cpu_online(cpu))
+			set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
+#endif
+	}
+
 	return 0;
 }
 
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
 {
 	set_zonelist_order();
 
@@ -2662,6 +3061,10 @@ void build_all_zonelists(void)
2662 } else { 3061 } else {
2663 /* we have to stop all cpus to guarantee there is no user 3062 /* we have to stop all cpus to guarantee there is no user
2664 of zonelist */ 3063 of zonelist */
3064#ifdef CONFIG_MEMORY_HOTPLUG
3065 if (data)
3066 setup_zone_pageset((struct zone *)data);
3067#endif
2665 stop_machine(__build_all_zonelists, NULL, NULL); 3068 stop_machine(__build_all_zonelists, NULL, NULL);
2666 /* cpuset refresh routine should be here */ 3069 /* cpuset refresh routine should be here */
2667 } 3070 }
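For readers following the new locking rule noted above, a rough sketch of what a memory-hotplug style caller would look like; the function itself is illustrative only, but zonelists_mutex and the new build_all_zonelists(void *) signature are taken from this patch:

/*
 * Illustrative hotplug-style caller: pass the newly populated zone so
 * its pageset is set up before the zonelists are rebuilt.
 */
static void example_online_zone(struct zone *zone)
{
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(zone);	/* stop_machine() path after boot */
	mutex_unlock(&zonelists_mutex);
}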
@@ -2768,7 +3171,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2768{ 3171{
2769 unsigned long start_pfn, pfn, end_pfn; 3172 unsigned long start_pfn, pfn, end_pfn;
2770 struct page *page; 3173 struct page *page;
2771 unsigned long reserve, block_migratetype; 3174 unsigned long block_migratetype;
3175 int reserve;
2772 3176
2773 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3177 /* Get the start pfn, end pfn and the number of blocks to reserve */
2774 start_pfn = zone->zone_start_pfn; 3178 start_pfn = zone->zone_start_pfn;
@@ -2776,6 +3180,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2776 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3180 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2777 pageblock_order; 3181 pageblock_order;
2778 3182
3183 /*
3184 * Reserve blocks are generally in place to help high-order atomic
3185 * allocations that are short-lived. A min_free_kbytes value that
3186 * would result in more than 2 reserve blocks for atomic allocations
3187 * is assumed to be in place to help anti-fragmentation for the
3188 * future allocation of hugepages at runtime.
3189 */
3190 reserve = min(2, reserve);
3191
2779 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 3192 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2780 if (!pfn_valid(pfn)) 3193 if (!pfn_valid(pfn))
2781 continue; 3194 continue;
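To make the clamp concrete with made-up numbers: if min_wmark_pages(zone) were 5120 and pageblock_nr_pages were 1024 (pageblock_order 10), the expression above yields roundup(5120, 1024) >> 10 = 5 reserve blocks, and min(2, reserve) caps that at 2. Switching reserve from unsigned long to int is also what lets min(2, reserve) pass min()'s type check.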
@@ -2946,6 +3359,7 @@ static int zone_batchsize(struct zone *zone)
2946static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 3359static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2947{ 3360{
2948 struct per_cpu_pages *pcp; 3361 struct per_cpu_pages *pcp;
3362 int migratetype;
2949 3363
2950 memset(p, 0, sizeof(*p)); 3364 memset(p, 0, sizeof(*p));
2951 3365
@@ -2953,7 +3367,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2953 pcp->count = 0; 3367 pcp->count = 0;
2954 pcp->high = 6 * batch; 3368 pcp->high = 6 * batch;
2955 pcp->batch = max(1UL, 1 * batch); 3369 pcp->batch = max(1UL, 1 * batch);
2956 INIT_LIST_HEAD(&pcp->list); 3370 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
3371 INIT_LIST_HEAD(&pcp->lists[migratetype]);
2957} 3372}
2958 3373
2959/* 3374/*
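The loop above relies on the per-cpu page structure now carrying one free list per migratetype instead of a single list. A simplified view of that layout, assuming it mirrors struct per_cpu_pages in this kernel's include/linux/mmzone.h (the _sketch name is only for illustration):

/* Simplified per-cpu page lists after the per-migratetype split. */
struct per_cpu_pages_sketch {
	int count;			/* pages across all lists        */
	int high;			/* high watermark for draining   */
	int batch;			/* chunk size for buddy transfer */
	struct list_head lists[MIGRATE_PCPTYPES];	/* one list per type */
};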
@@ -2973,121 +3388,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2973 pcp->batch = PAGE_SHIFT * 8; 3388 pcp->batch = PAGE_SHIFT * 8;
2974} 3389}
2975 3390
2976 3391static __meminit void setup_zone_pageset(struct zone *zone)
2977#ifdef CONFIG_NUMA
2978/*
2979 * Boot pageset table. One per cpu which is going to be used for all
2980 * zones and all nodes. The parameters will be set in such a way
2981 * that an item put on a list will immediately be handed over to
2982 * the buddy list. This is safe since pageset manipulation is done
2983 * with interrupts disabled.
2984 *
2985 * Some NUMA counter updates may also be caught by the boot pagesets.
2986 *
2987 * The boot_pagesets must be kept even after bootup is complete for
2988 * unused processors and/or zones. They do play a role for bootstrapping
2989 * hotplugged processors.
2990 *
2991 * zoneinfo_show() and maybe other functions do
2992 * not check if the processor is online before following the pageset pointer.
2993 * Other parts of the kernel may not check if the zone is available.
2994 */
2995static struct per_cpu_pageset boot_pageset[NR_CPUS];
2996
2997/*
2998 * Dynamically allocate memory for the
2999 * per cpu pageset array in struct zone.
3000 */
3001static int __cpuinit process_zones(int cpu)
3002{ 3392{
3003 struct zone *zone, *dzone; 3393 int cpu;
3004 int node = cpu_to_node(cpu);
3005 3394
3006 node_set_state(node, N_CPU); /* this node has a cpu */ 3395 zone->pageset = alloc_percpu(struct per_cpu_pageset);
3007 3396
3008 for_each_populated_zone(zone) { 3397 for_each_possible_cpu(cpu) {
3009 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3398 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
3010 GFP_KERNEL, node);
3011 if (!zone_pcp(zone, cpu))
3012 goto bad;
3013 3399
3014 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 3400 setup_pageset(pcp, zone_batchsize(zone));
3015 3401
3016 if (percpu_pagelist_fraction) 3402 if (percpu_pagelist_fraction)
3017 setup_pagelist_highmark(zone_pcp(zone, cpu), 3403 setup_pagelist_highmark(pcp,
3018 (zone->present_pages / percpu_pagelist_fraction)); 3404 (zone->present_pages /
3019 } 3405 percpu_pagelist_fraction));
3020
3021 return 0;
3022bad:
3023 for_each_zone(dzone) {
3024 if (!populated_zone(dzone))
3025 continue;
3026 if (dzone == zone)
3027 break;
3028 kfree(zone_pcp(dzone, cpu));
3029 zone_pcp(dzone, cpu) = NULL;
3030 } 3406 }
3031 return -ENOMEM;
3032} 3407}
3033 3408
3034static inline void free_zone_pagesets(int cpu) 3409/*
3035{ 3410 * Allocate per cpu pagesets and initialize them.
3036 struct zone *zone; 3411 * Before this call only boot pagesets were available.
3037 3412 */
3038 for_each_zone(zone) {
3039 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
3040
3041 /* Free per_cpu_pageset if it is slab allocated */
3042 if (pset != &boot_pageset[cpu])
3043 kfree(pset);
3044 zone_pcp(zone, cpu) = NULL;
3045 }
3046}
3047
3048static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
3049 unsigned long action,
3050 void *hcpu)
3051{
3052 int cpu = (long)hcpu;
3053 int ret = NOTIFY_OK;
3054
3055 switch (action) {
3056 case CPU_UP_PREPARE:
3057 case CPU_UP_PREPARE_FROZEN:
3058 if (process_zones(cpu))
3059 ret = NOTIFY_BAD;
3060 break;
3061 case CPU_UP_CANCELED:
3062 case CPU_UP_CANCELED_FROZEN:
3063 case CPU_DEAD:
3064 case CPU_DEAD_FROZEN:
3065 free_zone_pagesets(cpu);
3066 break;
3067 default:
3068 break;
3069 }
3070 return ret;
3071}
3072
3073static struct notifier_block __cpuinitdata pageset_notifier =
3074 { &pageset_cpuup_callback, NULL, 0 };
3075
3076void __init setup_per_cpu_pageset(void) 3413void __init setup_per_cpu_pageset(void)
3077{ 3414{
3078 int err; 3415 struct zone *zone;
3079 3416
3080 /* Initialize per_cpu_pageset for cpu 0. 3417 for_each_populated_zone(zone)
3081 * A cpuup callback will do this for every cpu 3418 setup_zone_pageset(zone);
3082 * as it comes online
3083 */
3084 err = process_zones(smp_processor_id());
3085 BUG_ON(err);
3086 register_cpu_notifier(&pageset_notifier);
3087} 3419}
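With the pagesets now percpu-allocated, hot paths reach the current CPU's lists through zone->pageset rather than the old zone_pcp() array. A minimal sketch, assuming interrupts are already disabled at the call site; the helper name is made up, while this_cpu_ptr() and the pcp member are real:

/* Sketch: current CPU's pcp lists for a zone (irqs assumed off). */
static struct per_cpu_pages *example_this_cpu_pcp(struct zone *zone)
{
	return &this_cpu_ptr(zone->pageset)->pcp;
}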
3088 3420
3089#endif
3090
3091static noinline __init_refok 3421static noinline __init_refok
3092int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 3422int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3093{ 3423{
@@ -3131,23 +3461,45 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3131 return 0; 3461 return 0;
3132} 3462}
3133 3463
3134static __meminit void zone_pcp_init(struct zone *zone) 3464static int __zone_pcp_update(void *data)
3135{ 3465{
3466 struct zone *zone = data;
3136 int cpu; 3467 int cpu;
3137 unsigned long batch = zone_batchsize(zone); 3468 unsigned long batch = zone_batchsize(zone), flags;
3138 3469
3139 for (cpu = 0; cpu < NR_CPUS; cpu++) { 3470 for_each_possible_cpu(cpu) {
3140#ifdef CONFIG_NUMA 3471 struct per_cpu_pageset *pset;
3141 /* Early boot. Slab allocator not functional yet */ 3472 struct per_cpu_pages *pcp;
3142 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 3473
3143 setup_pageset(&boot_pageset[cpu],0); 3474 pset = per_cpu_ptr(zone->pageset, cpu);
3144#else 3475 pcp = &pset->pcp;
3145 setup_pageset(zone_pcp(zone,cpu), batch); 3476
3146#endif 3477 local_irq_save(flags);
3478 free_pcppages_bulk(zone, pcp->count, pcp);
3479 setup_pageset(pset, batch);
3480 local_irq_restore(flags);
3147 } 3481 }
3482 return 0;
3483}
3484
3485void zone_pcp_update(struct zone *zone)
3486{
3487 stop_machine(__zone_pcp_update, zone, NULL);
3488}
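A hedged usage note: after a zone's size changes (say, when memory is onlined and zone->present_pages grows), a caller would refresh the per-cpu batch and high marks roughly as below; the surrounding function is illustrative, only zone_pcp_update() comes from the patch:

/* Illustrative only: recompute pcp batch/high after resizing a zone. */
static void example_after_zone_resize(struct zone *zone)
{
	zone_pcp_update(zone);	/* drains pcp lists, re-runs setup_pageset() */
}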
3489
3490static __meminit void zone_pcp_init(struct zone *zone)
3491{
3492 /*
3493 * per cpu subsystem is not up at this point. The following code
3494 * relies on the ability of the linker to provide the
3495 * offset of a (static) per cpu variable into the per cpu area.
3496 */
3497 zone->pageset = &boot_pageset;
3498
3148 if (zone->present_pages) 3499 if (zone->present_pages)
3149 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 3500 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
3150 zone->name, zone->present_pages, batch); 3501 zone->name, zone->present_pages,
3502 zone_batchsize(zone));
3151} 3503}
3152 3504
3153__meminit int init_currently_empty_zone(struct zone *zone, 3505__meminit int init_currently_empty_zone(struct zone *zone,
@@ -3286,6 +3638,84 @@ void __init free_bootmem_with_active_regions(int nid,
3286 } 3638 }
3287} 3639}
3288 3640
3641#ifdef CONFIG_HAVE_MEMBLOCK
3642u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3643 u64 goal, u64 limit)
3644{
3645 int i;
3646
3647 /* Need to go over early_node_map to find out good range for node */
3648 for_each_active_range_index_in_nid(i, nid) {
3649 u64 addr;
3650 u64 ei_start, ei_last;
3651 u64 final_start, final_end;
3652
3653 ei_last = early_node_map[i].end_pfn;
3654 ei_last <<= PAGE_SHIFT;
3655 ei_start = early_node_map[i].start_pfn;
3656 ei_start <<= PAGE_SHIFT;
3657
3658 final_start = max(ei_start, goal);
3659 final_end = min(ei_last, limit);
3660
3661 if (final_start >= final_end)
3662 continue;
3663
3664 addr = memblock_find_in_range(final_start, final_end, size, align);
3665
3666 if (addr == MEMBLOCK_ERROR)
3667 continue;
3668
3669 return addr;
3670 }
3671
3672 return MEMBLOCK_ERROR;
3673}
3674#endif
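The ei_start/ei_last conversions above are plain shifts from page frame numbers to byte addresses, which is what memblock_find_in_range() expects. A self-contained illustration, assuming 4 KiB pages (PAGE_SHIFT of 12), where pfn 0x100 maps to physical address 0x100000:

/* PFN to byte address, as done for ei_start/ei_last above (example only). */
#define EXAMPLE_PAGE_SHIFT 12	/* assumes 4 KiB pages */

static unsigned long long example_pfn_to_phys(unsigned long long pfn)
{
	return pfn << EXAMPLE_PAGE_SHIFT;	/* 0x100 -> 0x100000 */
}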
3675
3676int __init add_from_early_node_map(struct range *range, int az,
3677 int nr_range, int nid)
3678{
3679 int i;
3680 u64 start, end;
3681
3682 /* need to go over early_node_map to find out good range for node */
3683 for_each_active_range_index_in_nid(i, nid) {
3684 start = early_node_map[i].start_pfn;
3685 end = early_node_map[i].end_pfn;
3686 nr_range = add_range(range, az, nr_range, start, end);
3687 }
3688 return nr_range;
3689}
3690
3691#ifdef CONFIG_NO_BOOTMEM
3692void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3693 u64 goal, u64 limit)
3694{
3695 void *ptr;
3696 u64 addr;
3697
3698 if (limit > memblock.current_limit)
3699 limit = memblock.current_limit;
3700
3701 addr = find_memory_core_early(nid, size, align, goal, limit);
3702
3703 if (addr == MEMBLOCK_ERROR)
3704 return NULL;
3705
3706 ptr = phys_to_virt(addr);
3707 memset(ptr, 0, size);
3708 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3709 /*
3710 * The min_count is set to 0 so that bootmem allocated blocks
3711 * are never reported as leaks.
3712 */
3713 kmemleak_alloc(ptr, size, 0, 0);
3714 return ptr;
3715}
3716#endif
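A rough sketch of how an early, node-local allocation could be requested through the helper above under CONFIG_NO_BOOTMEM; the wrapper is hypothetical, and the goal/limit arguments simply mean "no lower goal, no caller-imposed limit" (the function clamps to memblock.current_limit anyway):

/* Hypothetical early-boot wrapper: node-local, zeroed, whole allowed range. */
static void * __init example_early_alloc_node(int nid, u64 size, u64 align)
{
	return __alloc_memory_core_early(nid, size, align, 0, -1ULL);
}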
3717
3718
3289void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3719void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3290{ 3720{
3291 int i; 3721 int i;
@@ -3435,7 +3865,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3435 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3865 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3436 * then all holes in the requested range will be accounted for. 3866 * then all holes in the requested range will be accounted for.
3437 */ 3867 */
3438static unsigned long __meminit __absent_pages_in_range(int nid, 3868unsigned long __meminit __absent_pages_in_range(int nid,
3439 unsigned long range_start_pfn, 3869 unsigned long range_start_pfn,
3440 unsigned long range_end_pfn) 3870 unsigned long range_end_pfn)
3441{ 3871{
@@ -3700,12 +4130,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3700 zone_seqlock_init(zone); 4130 zone_seqlock_init(zone);
3701 zone->zone_pgdat = pgdat; 4131 zone->zone_pgdat = pgdat;
3702 4132
3703 zone->prev_priority = DEF_PRIORITY;
3704
3705 zone_pcp_init(zone); 4133 zone_pcp_init(zone);
3706 for_each_lru(l) { 4134 for_each_lru(l) {
3707 INIT_LIST_HEAD(&zone->lru[l].list); 4135 INIT_LIST_HEAD(&zone->lru[l].list);
3708 zone->lru[l].nr_saved_scan = 0; 4136 zone->reclaim_stat.nr_saved_scan[l] = 0;
3709 } 4137 }
3710 zone->reclaim_stat.recent_rotated[0] = 0; 4138 zone->reclaim_stat.recent_rotated[0] = 0;
3711 zone->reclaim_stat.recent_rotated[1] = 0; 4139 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3850,7 +4278,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3850 } 4278 }
3851 4279
3852 /* Merge backward if suitable */ 4280 /* Merge backward if suitable */
3853 if (start_pfn < early_node_map[i].end_pfn && 4281 if (start_pfn < early_node_map[i].start_pfn &&
3854 end_pfn >= early_node_map[i].start_pfn) { 4282 end_pfn >= early_node_map[i].start_pfn) {
3855 early_node_map[i].start_pfn = start_pfn; 4283 early_node_map[i].start_pfn = start_pfn;
3856 return; 4284 return;
@@ -3964,7 +4392,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
3964} 4392}
3965 4393
3966/* sort the node_map by start_pfn */ 4394/* sort the node_map by start_pfn */
3967static void __init sort_node_map(void) 4395void __init sort_node_map(void)
3968{ 4396{
3969 sort(early_node_map, (size_t)nr_nodemap_entries, 4397 sort(early_node_map, (size_t)nr_nodemap_entries,
3970 sizeof(struct node_active_region), 4398 sizeof(struct node_active_region),
@@ -4032,6 +4460,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4032 int i, nid; 4460 int i, nid;
4033 unsigned long usable_startpfn; 4461 unsigned long usable_startpfn;
4034 unsigned long kernelcore_node, kernelcore_remaining; 4462 unsigned long kernelcore_node, kernelcore_remaining;
4463 /* save the state before borrow the nodemask */
4464 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
4035 unsigned long totalpages = early_calculate_totalpages(); 4465 unsigned long totalpages = early_calculate_totalpages();
4036 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4466 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
4037 4467
@@ -4059,7 +4489,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
4059 4489
4060 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4490 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
4061 if (!required_kernelcore) 4491 if (!required_kernelcore)
4062 return; 4492 goto out;
4063 4493
4064 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4494 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
4065 find_usable_zone_for_movable(); 4495 find_usable_zone_for_movable();
@@ -4158,6 +4588,10 @@ restart:
4158 for (nid = 0; nid < MAX_NUMNODES; nid++) 4588 for (nid = 0; nid < MAX_NUMNODES; nid++)
4159 zone_movable_pfn[nid] = 4589 zone_movable_pfn[nid] =
4160 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4590 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4591
4592out:
4593 /* restore the node_state */
4594 node_states[N_HIGH_MEMORY] = saved_node_state;
4161} 4595}
4162 4596
4163/* Any regular memory on that node ? */ 4597/* Any regular memory on that node ? */
@@ -4222,8 +4656,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4222 for (i = 0; i < MAX_NR_ZONES; i++) { 4656 for (i = 0; i < MAX_NR_ZONES; i++) {
4223 if (i == ZONE_MOVABLE) 4657 if (i == ZONE_MOVABLE)
4224 continue; 4658 continue;
4225 printk(" %-8s %0#10lx -> %0#10lx\n", 4659 printk(" %-8s ", zone_names[i]);
4226 zone_names[i], 4660 if (arch_zone_lowest_possible_pfn[i] ==
4661 arch_zone_highest_possible_pfn[i])
4662 printk("empty\n");
4663 else
4664 printk("%0#10lx -> %0#10lx\n",
4227 arch_zone_lowest_possible_pfn[i], 4665 arch_zone_lowest_possible_pfn[i],
4228 arch_zone_highest_possible_pfn[i]); 4666 arch_zone_highest_possible_pfn[i]);
4229 } 4667 }
@@ -4242,11 +4680,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4242 early_node_map[i].start_pfn, 4680 early_node_map[i].start_pfn,
4243 early_node_map[i].end_pfn); 4681 early_node_map[i].end_pfn);
4244 4682
4245 /*
4246 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4247 * that node_mask, clear it at first
4248 */
4249 nodes_clear(node_states[N_HIGH_MEMORY]);
4250 /* Initialise every node */ 4683 /* Initialise every node */
4251 mminit_verify_pageflags_layout(); 4684 mminit_verify_pageflags_layout();
4252 setup_nr_node_ids(); 4685 setup_nr_node_ids();
@@ -4317,7 +4750,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4317} 4750}
4318 4751
4319#ifndef CONFIG_NEED_MULTIPLE_NODES 4752#ifndef CONFIG_NEED_MULTIPLE_NODES
4320struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; 4753struct pglist_data __refdata contig_page_data = {
4754#ifndef CONFIG_NO_BOOTMEM
4755 .bdata = &bootmem_node_data[0]
4756#endif
4757 };
4321EXPORT_SYMBOL(contig_page_data); 4758EXPORT_SYMBOL(contig_page_data);
4322#endif 4759#endif
4323 4760
@@ -4493,7 +4930,7 @@ void setup_per_zone_wmarks(void)
4493 calculate_totalreserve_pages(); 4930 calculate_totalreserve_pages();
4494} 4931}
4495 4932
4496/** 4933/*
4497 * The inactive anon list should be small enough that the VM never has to 4934 * The inactive anon list should be small enough that the VM never has to
4498 * do too much work, but large enough that each inactive page has a chance 4935 * do too much work, but large enough that each inactive page has a chance
4499 * to be referenced again before it is swapped out. 4936 * to be referenced again before it is swapped out.
@@ -4584,9 +5021,9 @@ module_init(init_per_zone_wmark_min)
4584 * changes. 5021 * changes.
4585 */ 5022 */
4586int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5023int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4587 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 5024 void __user *buffer, size_t *length, loff_t *ppos)
4588{ 5025{
4589 proc_dointvec(table, write, file, buffer, length, ppos); 5026 proc_dointvec(table, write, buffer, length, ppos);
4590 if (write) 5027 if (write)
4591 setup_per_zone_wmarks(); 5028 setup_per_zone_wmarks();
4592 return 0; 5029 return 0;
@@ -4594,12 +5031,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4594 5031
4595#ifdef CONFIG_NUMA 5032#ifdef CONFIG_NUMA
4596int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 5033int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4597 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 5034 void __user *buffer, size_t *length, loff_t *ppos)
4598{ 5035{
4599 struct zone *zone; 5036 struct zone *zone;
4600 int rc; 5037 int rc;
4601 5038
4602 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 5039 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4603 if (rc) 5040 if (rc)
4604 return rc; 5041 return rc;
4605 5042
@@ -4610,12 +5047,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
4610} 5047}
4611 5048
4612int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 5049int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4613 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 5050 void __user *buffer, size_t *length, loff_t *ppos)
4614{ 5051{
4615 struct zone *zone; 5052 struct zone *zone;
4616 int rc; 5053 int rc;
4617 5054
4618 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 5055 rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
4619 if (rc) 5056 if (rc)
4620 return rc; 5057 return rc;
4621 5058
@@ -4636,9 +5073,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4636 * if in function of the boot time zone sizes. 5073 * if in function of the boot time zone sizes.
4637 */ 5074 */
4638int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 5075int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4639 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 5076 void __user *buffer, size_t *length, loff_t *ppos)
4640{ 5077{
4641 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 5078 proc_dointvec_minmax(table, write, buffer, length, ppos);
4642 setup_per_zone_lowmem_reserve(); 5079 setup_per_zone_lowmem_reserve();
4643 return 0; 5080 return 0;
4644} 5081}
@@ -4650,20 +5087,21 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
4650 */ 5087 */
4651 5088
4652int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5089int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4653 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 5090 void __user *buffer, size_t *length, loff_t *ppos)
4654{ 5091{
4655 struct zone *zone; 5092 struct zone *zone;
4656 unsigned int cpu; 5093 unsigned int cpu;
4657 int ret; 5094 int ret;
4658 5095
4659 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 5096 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
4660 if (!write || (ret == -EINVAL)) 5097 if (!write || (ret == -EINVAL))
4661 return ret; 5098 return ret;
4662 for_each_zone(zone) { 5099 for_each_populated_zone(zone) {
4663 for_each_online_cpu(cpu) { 5100 for_each_possible_cpu(cpu) {
4664 unsigned long high; 5101 unsigned long high;
4665 high = zone->present_pages / percpu_pagelist_fraction; 5102 high = zone->present_pages / percpu_pagelist_fraction;
4666 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 5103 setup_pagelist_highmark(
5104 per_cpu_ptr(zone->pageset, cpu), high);
4667 } 5105 }
4668 } 5106 }
4669 return 0; 5107 return 0;
@@ -4716,7 +5154,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4716 numentries <<= (PAGE_SHIFT - scale); 5154 numentries <<= (PAGE_SHIFT - scale);
4717 5155
4718 /* Make sure we've got at least a 0-order allocation.. */ 5156 /* Make sure we've got at least a 0-order allocation.. */
4719 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 5157 if (unlikely(flags & HASH_SMALL)) {
5158 /* Makes no sense without HASH_EARLY */
5159 WARN_ON(!(flags & HASH_EARLY));
5160 if (!(numentries >> *_hash_shift)) {
5161 numentries = 1UL << *_hash_shift;
5162 BUG_ON(!numentries);
5163 }
5164 } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
4720 numentries = PAGE_SIZE / bucketsize; 5165 numentries = PAGE_SIZE / bucketsize;
4721 } 5166 }
4722 numentries = roundup_pow_of_two(numentries); 5167 numentries = roundup_pow_of_two(numentries);
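For a feel of the sizing logic with illustrative numbers: with a bucketsize of 8 bytes and PAGE_SIZE of 4096, a tiny request is bumped to PAGE_SIZE / bucketsize = 512 entries; with HASH_SMALL set and *_hash_shift of, say, 4, the floor becomes 1 << 4 = 16 entries instead. Either way the result is then rounded up to a power of two by roundup_pow_of_two().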
@@ -4744,17 +5189,19 @@ void *__init alloc_large_system_hash(const char *tablename,
4744 * some pages at the end of hash table which 5189 * some pages at the end of hash table which
4745 * alloc_pages_exact() automatically does 5190 * alloc_pages_exact() automatically does
4746 */ 5191 */
4747 if (get_order(size) < MAX_ORDER) 5192 if (get_order(size) < MAX_ORDER) {
4748 table = alloc_pages_exact(size, GFP_ATOMIC); 5193 table = alloc_pages_exact(size, GFP_ATOMIC);
5194 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
5195 }
4749 } 5196 }
4750 } while (!table && size > PAGE_SIZE && --log2qty); 5197 } while (!table && size > PAGE_SIZE && --log2qty);
4751 5198
4752 if (!table) 5199 if (!table)
4753 panic("Failed to allocate %s hash table\n", tablename); 5200 panic("Failed to allocate %s hash table\n", tablename);
4754 5201
4755 printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", 5202 printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
4756 tablename, 5203 tablename,
4757 (1U << log2qty), 5204 (1UL << log2qty),
4758 ilog2(size) - PAGE_SHIFT, 5205 ilog2(size) - PAGE_SHIFT,
4759 size); 5206 size);
4760 5207
@@ -4763,16 +5210,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4763 if (_hash_mask) 5210 if (_hash_mask)
4764 *_hash_mask = (1 << log2qty) - 1; 5211 *_hash_mask = (1 << log2qty) - 1;
4765 5212
4766 /*
4767 * If hashdist is set, the table allocation is done with __vmalloc()
4768 * which invokes the kmemleak_alloc() callback. This function may also
4769 * be called before the slab and kmemleak are initialised when
4770 * kmemleak simply buffers the request to be executed later
4771 * (GFP_ATOMIC flag ignored in this case).
4772 */
4773 if (!hashdist)
4774 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4775
4776 return table; 5213 return table;
4777} 5214}
4778 5215
@@ -4861,23 +5298,113 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
4861 * page allocator never allocates memory from ISOLATE blocks. 5298
4862 */ 5299 */
4863 5300
5301static int
5302__count_immobile_pages(struct zone *zone, struct page *page, int count)
5303{
5304 unsigned long pfn, iter, found;
5305 /*
5306 * To avoid noisy data, lru_add_drain_all() should be called beforehand.
5307 * If the zone is ZONE_MOVABLE, it never contains immobile pages.
5308 */
5309 if (zone_idx(zone) == ZONE_MOVABLE)
5310 return true;
5311
5312 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5313 return true;
5314
5315 pfn = page_to_pfn(page);
5316 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5317 unsigned long check = pfn + iter;
5318
5319 if (!pfn_valid_within(check)) {
5320 iter++;
5321 continue;
5322 }
5323 page = pfn_to_page(check);
5324 if (!page_count(page)) {
5325 if (PageBuddy(page))
5326 iter += (1 << page_order(page)) - 1;
5327 continue;
5328 }
5329 if (!PageLRU(page))
5330 found++;
5331 /*
5332 * If there are RECLAIMABLE pages, we need to check them too.
5333 * But for now, memory offline itself doesn't call shrink_slab(),
5334 * and that still needs to be fixed.
5335 */
5336 /*
5337 * If the page is not RAM, page_count() should be 0.
5338 * We don't need further checks; this is a _used_, non-movable page.
5339 *
5340 * The problematic thing here is PG_reserved pages. PG_reserved
5341 * is set to both of a memory hole page and a _used_ kernel
5342 * page at boot.
5343 */
5344 if (found > count)
5345 return false;
5346 }
5347 return true;
5348}
5349
5350bool is_pageblock_removable_nolock(struct page *page)
5351{
5352 struct zone *zone = page_zone(page);
5353 return __count_immobile_pages(zone, page, 0);
5354}
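A sketch of how a hot-remove style check might walk a range pageblock by pageblock using the helper above; the function and its range assumptions are illustrative, not the actual memory_hotplug.c code:

/*
 * Illustrative only: true if every valid pageblock in
 * [start_pfn, end_pfn) looks removable. Assumes pageblock alignment.
 */
static bool example_range_removable(unsigned long start_pfn,
				    unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		if (!pfn_valid(pfn))
			continue;
		if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
			return false;
	}
	return true;
}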
5355
4864int set_migratetype_isolate(struct page *page) 5356int set_migratetype_isolate(struct page *page)
4865{ 5357{
4866 struct zone *zone; 5358 struct zone *zone;
4867 unsigned long flags; 5359 unsigned long flags, pfn;
5360 struct memory_isolate_notify arg;
5361 int notifier_ret;
4868 int ret = -EBUSY; 5362 int ret = -EBUSY;
5363 int zone_idx;
4869 5364
4870 zone = page_zone(page); 5365 zone = page_zone(page);
5366 zone_idx = zone_idx(zone);
5367
4871 spin_lock_irqsave(&zone->lock, flags); 5368 spin_lock_irqsave(&zone->lock, flags);
5369
5370 pfn = page_to_pfn(page);
5371 arg.start_pfn = pfn;
5372 arg.nr_pages = pageblock_nr_pages;
5373 arg.pages_found = 0;
5374
4872 /* 5375 /*
4873 * In future, more migrate types will be able to be isolation target. 5376 * It may be possible to isolate a pageblock even if the
5377 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5378 * notifier chain is used by balloon drivers to return the
5379 * number of pages in a range that are held by the balloon
5380 * driver to shrink memory. If all the pages are accounted for
5381 * by balloons, are free, or on the LRU, isolation can continue.
5382 * Later, for example, when memory hotplug notifier runs, these
5383 * pages reported as "can be isolated" should be isolated (freed)
5384 * by the balloon driver through the memory notifier chain.
4874 */ 5385 */
4875 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) 5386 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5387 notifier_ret = notifier_to_errno(notifier_ret);
5388 if (notifier_ret)
4876 goto out; 5389 goto out;
4877 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5390 /*
4878 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5391 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
4879 ret = 0; 5392 * We just check MOVABLE pages.
5393 */
5394 if (__count_immobile_pages(zone, page, arg.pages_found))
5395 ret = 0;
5396
5397 /*
5398 * "Immobile" here means not-on-LRU pages. If there are more immobile
5399 * pages than removable-by-driver pages reported by the notifier, we'll fail.
5400 */
5401
4880out: 5402out:
5403 if (!ret) {
5404 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5405 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5406 }
5407
4881 spin_unlock_irqrestore(&zone->lock, flags); 5408 spin_unlock_irqrestore(&zone->lock, flags);
4882 if (!ret) 5409 if (!ret)
4883 drain_all_pages(); 5410 drain_all_pages();
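The notifier gives a balloon driver a chance to report how many pages in the block it already owns, so they do not block isolation. A rough sketch of such a callback, assuming the MEM_ISOLATE_COUNT action and struct memory_isolate_notify shown above plus the register_memory_isolate_notifier() helper from <linux/memory.h>; the actual counting is hand-waved:

/* Sketch of a balloon driver's memory-isolation notifier. */
static int example_balloon_isolate(struct notifier_block *nb,
				   unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	/*
	 * A real driver would count the pages it holds in
	 * [arg->start_pfn, arg->start_pfn + arg->nr_pages) here;
	 * this sketch pretends it holds none.
	 */
	arg->pages_found += 0;
	return NOTIFY_OK;
}

static struct notifier_block example_balloon_nb = {
	.notifier_call = example_balloon_isolate,
};
/* registered once at probe time via
   register_memory_isolate_notifier(&example_balloon_nb) (assumed API) */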
@@ -4944,3 +5471,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4944 spin_unlock_irqrestore(&zone->lock, flags); 5471 spin_unlock_irqrestore(&zone->lock, flags);
4945} 5472}
4946#endif 5473#endif
5474
5475#ifdef CONFIG_MEMORY_FAILURE
5476bool is_free_buddy_page(struct page *page)
5477{
5478 struct zone *zone = page_zone(page);
5479 unsigned long pfn = page_to_pfn(page);
5480 unsigned long flags;
5481 int order;
5482
5483 spin_lock_irqsave(&zone->lock, flags);
5484 for (order = 0; order < MAX_ORDER; order++) {
5485 struct page *page_head = page - (pfn & ((1 << order) - 1));
5486
5487 if (PageBuddy(page_head) && page_order(page_head) >= order)
5488 break;
5489 }
5490 spin_unlock_irqrestore(&zone->lock, flags);
5491
5492 return order < MAX_ORDER;
5493}
5494#endif
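The head lookup above just clears the bottom 'order' bits of the pfn: for a page at pfn 0x1235 and order 3, pfn & ((1 << 3) - 1) is 5, so page_head points at pfn 0x1230; if that head is a tracked buddy of order 3 or more, the original page lies inside a free buddy block and the loop stops early with order < MAX_ORDER.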
5495
5496static struct trace_print_flags pageflag_names[] = {
5497 {1UL << PG_locked, "locked" },
5498 {1UL << PG_error, "error" },
5499 {1UL << PG_referenced, "referenced" },
5500 {1UL << PG_uptodate, "uptodate" },
5501 {1UL << PG_dirty, "dirty" },
5502 {1UL << PG_lru, "lru" },
5503 {1UL << PG_active, "active" },
5504 {1UL << PG_slab, "slab" },
5505 {1UL << PG_owner_priv_1, "owner_priv_1" },
5506 {1UL << PG_arch_1, "arch_1" },
5507 {1UL << PG_reserved, "reserved" },
5508 {1UL << PG_private, "private" },
5509 {1UL << PG_private_2, "private_2" },
5510 {1UL << PG_writeback, "writeback" },
5511#ifdef CONFIG_PAGEFLAGS_EXTENDED
5512 {1UL << PG_head, "head" },
5513 {1UL << PG_tail, "tail" },
5514#else
5515 {1UL << PG_compound, "compound" },
5516#endif
5517 {1UL << PG_swapcache, "swapcache" },
5518 {1UL << PG_mappedtodisk, "mappedtodisk" },
5519 {1UL << PG_reclaim, "reclaim" },
5520 {1UL << PG_buddy, "buddy" },
5521 {1UL << PG_swapbacked, "swapbacked" },
5522 {1UL << PG_unevictable, "unevictable" },
5523#ifdef CONFIG_MMU
5524 {1UL << PG_mlocked, "mlocked" },
5525#endif
5526#ifdef CONFIG_ARCH_USES_PG_UNCACHED
5527 {1UL << PG_uncached, "uncached" },
5528#endif
5529#ifdef CONFIG_MEMORY_FAILURE
5530 {1UL << PG_hwpoison, "hwpoison" },
5531#endif
5532 {-1UL, NULL },
5533};
5534
5535static void dump_page_flags(unsigned long flags)
5536{
5537 const char *delim = "";
5538 unsigned long mask;
5539 int i;
5540
5541 printk(KERN_ALERT "page flags: %#lx(", flags);
5542
5543 /* remove zone id */
5544 flags &= (1UL << NR_PAGEFLAGS) - 1;
5545
5546 for (i = 0; pageflag_names[i].name && flags; i++) {
5547
5548 mask = pageflag_names[i].mask;
5549 if ((flags & mask) != mask)
5550 continue;
5551
5552 flags &= ~mask;
5553 printk("%s%s", delim, pageflag_names[i].name);
5554 delim = "|";
5555 }
5556
5557 /* check for left over flags */
5558 if (flags)
5559 printk("%s%#lx", delim, flags);
5560
5561 printk(")\n");
5562}
5563
5564void dump_page(struct page *page)
5565{
5566 printk(KERN_ALERT
5567 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5568 page, page_count(page), page_mapcount(page),
5569 page->mapping, page->index);
5570 dump_page_flags(page->flags);
5571}
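A hedged example of a call site: a debugging check that wants the page's state in the log before warning. The helper and its condition are made up; dump_page() and page_mapcount() are as above:

/* Illustrative debugging helper: log page state when a sanity check trips. */
static void example_check_unmapped(struct page *page)
{
	if (page_mapcount(page)) {
		dump_page(page);	/* count/mapcount/mapping/index + flags */
		WARN_ON(1);
	}
}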