author    Paul Mundt <lethal@linux-sh.org>  2011-01-13 01:06:28 -0500
committer Paul Mundt <lethal@linux-sh.org>  2011-01-13 01:06:28 -0500
commit    f43dc23d5ea91fca257be02138a255f02d98e806 (patch)
tree      b29722f6e965316e90ac97abf79923ced250dc21 /mm/page_alloc.c
parent    f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff)
parent    4162cf64973df51fc885825bc9ca4d055891c49f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts:
arch/sh/kernel/cpu/sh2/setup-sh7619.c
arch/sh/kernel/cpu/sh2a/setup-mxg.c
arch/sh/kernel/cpu/sh2a/setup-sh7201.c
arch/sh/kernel/cpu/sh2a/setup-sh7203.c
arch/sh/kernel/cpu/sh2a/setup-sh7206.c
arch/sh/kernel/cpu/sh3/setup-sh7705.c
arch/sh/kernel/cpu/sh3/setup-sh770x.c
arch/sh/kernel/cpu/sh3/setup-sh7710.c
arch/sh/kernel/cpu/sh3/setup-sh7720.c
arch/sh/kernel/cpu/sh4/setup-sh4-202.c
arch/sh/kernel/cpu/sh4/setup-sh7750.c
arch/sh/kernel/cpu/sh4/setup-sh7760.c
arch/sh/kernel/cpu/sh4a/setup-sh7343.c
arch/sh/kernel/cpu/sh4a/setup-sh7366.c
arch/sh/kernel/cpu/sh4a/setup-sh7722.c
arch/sh/kernel/cpu/sh4a/setup-sh7723.c
arch/sh/kernel/cpu/sh4a/setup-sh7724.c
arch/sh/kernel/cpu/sh4a/setup-sh7763.c
arch/sh/kernel/cpu/sh4a/setup-sh7770.c
arch/sh/kernel/cpu/sh4a/setup-sh7780.c
arch/sh/kernel/cpu/sh4a/setup-sh7785.c
arch/sh/kernel/cpu/sh4a/setup-sh7786.c
arch/sh/kernel/cpu/sh4a/setup-shx3.c
arch/sh/kernel/cpu/sh5/setup-sh5.c
drivers/serial/sh-sci.c
drivers/serial/sh-sci.h
include/linux/serial_sci.h
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  1307
1 file changed, 966 insertions, 341 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 30d5093a099d..ff7e15872398 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/jiffies.h>
 #include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
@@ -48,11 +49,31 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/compaction.h>
+#include <trace/events/kmem.h>
+#include <linux/ftrace_event.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
+#ifdef CONFIG_HAVE_MEMORYLESS_NODES
+/*
+ * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
+ * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
+ * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
+ * defined in <linux/topology.h>.
+ */
+DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
+EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+#endif
+
 /*
  * Array of node states.
  */
@@ -71,10 +92,39 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+#ifdef CONFIG_PM_SLEEP
+/*
+ * The following functions are used by the suspend/hibernate code to temporarily
+ * change gfp_allowed_mask in order to avoid using I/O during memory allocations
+ * while devices are suspended. To avoid races with the suspend/hibernate code,
+ * they should always be called with pm_mutex held (gfp_allowed_mask also should
+ * only be modified with pm_mutex held, unless the suspend/hibernate code is
+ * guaranteed not to run in parallel with that modification).
+ */
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
+}
+
+void pm_restrict_gfp_mask(void)
+{
+	WARN_ON(!mutex_is_locked(&pm_mutex));
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
+}
+#endif /* CONFIG_PM_SLEEP */
+
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 int pageblock_order __read_mostly;
 #endif
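
The gfp_allowed_mask touched in the hunk above acts as a global filter that allocation requests are ANDed against, so clearing the I/O and FS bits while devices are suspended quietly downgrades any later request that would otherwise block on storage. A minimal standalone sketch of that masking idea, with illustrative flag values rather than the kernel's real GFP bits:

#include <stdio.h>

typedef unsigned int gfp_t;

/* Illustrative bit values; the real ones live in <linux/gfp.h>. */
#define MY_GFP_IO   0x1u
#define MY_GFP_FS   0x2u
#define MY_GFP_WAIT 0x4u

static gfp_t gfp_allowed_mask = MY_GFP_IO | MY_GFP_FS | MY_GFP_WAIT;
static gfp_t saved_gfp_mask;

static void restrict_for_suspend(void)
{
	saved_gfp_mask = gfp_allowed_mask;
	gfp_allowed_mask &= ~(MY_GFP_IO | MY_GFP_FS);	/* like masking out GFP_IOFS */
}

static void restore_after_resume(void)
{
	if (saved_gfp_mask) {
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

int main(void)
{
	gfp_t request = MY_GFP_IO | MY_GFP_FS | MY_GFP_WAIT;

	restrict_for_suspend();
	/* An allocation path would effectively use request & gfp_allowed_mask. */
	printf("effective flags during suspend: %#x\n", request & gfp_allowed_mask);
	restore_after_resume();
	printf("effective flags after resume:   %#x\n", request & gfp_allowed_mask);
	return 0;
}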
@@ -123,8 +173,8 @@ static char * const zone_names[MAX_NR_ZONES] = {
 
 int min_free_kbytes = 1024;
 
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
 static unsigned long __meminitdata dma_reserve;
 
 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@ -234,6 +284,12 @@ static void bad_page(struct page *page)
 	static unsigned long nr_shown;
 	static unsigned long nr_unshown;
 
+	/* Don't complain about poisoned pages */
+	if (PageHWPoison(page)) {
+		__ClearPageBuddy(page);
+		return;
+	}
+
 	/*
 	 * Allow a burst of 60 reports, then keep quiet for that minute;
 	 * or allow a steady drip of one report per second.
@@ -256,10 +312,7 @@ static void bad_page(struct page *page)
 
 	printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
 		current->comm, page_to_pfn(page));
-	printk(KERN_ALERT
-		"page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
-		page, (void *)page->flags, page_count(page),
-		page_mapcount(page), page->mapping, page->index);
+	dump_page(page);
 
 	dump_stack();
 out:
@@ -445,6 +498,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -458,9 +513,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -475,12 +527,32 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
  * free_page_mlock() -- clean up attempts to free and mlocked() page.
  * Page should not be on lru, so no need to fix that up.
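
The tail-placement check added above leans on the buddy index arithmetic the rest of this file already uses: the buddy of a block is found by flipping bit `order` of its index, and the merged parent index is the same value with that bit cleared. A minimal standalone sketch of just that arithmetic, with hypothetical helper names rather than the kernel's __page_find_buddy()/__find_combined_index():

#include <assert.h>

/* Buddy of the block starting at page_idx, at the given order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

/* Index of the merged (order + 1) block that contains both buddies. */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	/* Pages 8..11 and 12..15 are order-2 buddies; they merge at index 8. */
	assert(buddy_index(8, 2) == 12);
	assert(buddy_index(12, 2) == 8);
	assert(combined_index(12, 2) == 8);
	return 0;
}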
@@ -491,9 +563,6 @@ static inline void free_page_mlock(struct page *page)
 	__dec_zone_page_state(page, NR_MLOCK);
 	__count_vm_event(UNEVICTABLE_MLOCKFREED);
 }
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
 
 static inline int free_pages_check(struct page *page)
 {
@@ -510,7 +579,7 @@ static inline int free_pages_check(struct page *page)
 }
 
 /*
- * Frees a list of pages.
+ * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free.
  *
@@ -520,23 +589,45 @@ static inline int free_pages_check(struct page *page)
  * And clear the zone's pages_scanned counter, to hold off the "all pages are
  * pinned" detection logic.
  */
-static void free_pages_bulk(struct zone *zone, int count,
-					struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+					struct per_cpu_pages *pcp)
 {
+	int migratetype = 0;
+	int batch_free = 0;
+	int to_free = count;
+
 	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
-	while (count--) {
+	while (to_free) {
 		struct page *page;
+		struct list_head *list;
 
-		VM_BUG_ON(list_empty(list));
-		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_one_page list manipulates */
-		list_del(&page->lru);
-		__free_one_page(page, zone, order, page_private(page));
+		/*
+		 * Remove pages from lists in a round-robin fashion. A
+		 * batch_free count is maintained that is incremented when an
+		 * empty list is encountered. This is so more pages are freed
+		 * off fuller lists instead of spinning excessively around empty
+		 * lists
+		 */
+		do {
+			batch_free++;
+			if (++migratetype == MIGRATE_PCPTYPES)
+				migratetype = 0;
+			list = &pcp->lists[migratetype];
+		} while (list_empty(list));
+
+		do {
+			page = list_entry(list->prev, struct page, lru);
+			/* must delete as __free_one_page list manipulates */
+			list_del(&page->lru);
+			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+			__free_one_page(page, zone, 0, page_private(page));
+			trace_mm_page_pcpu_drain(page, 0, page_private(page));
+		} while (--to_free && --batch_free && !list_empty(list));
 	}
+	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
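
The round-robin/batch_free scheme described in the new comment can be tried in isolation. Below is a simplified sketch with plain counters standing in for the per-migratetype PCP lists; like the kernel caller, it assumes count never exceeds the total number of queued pages:

#include <stdio.h>

#define NR_LISTS 3

/* Drain 'count' items, taking larger batches from fuller lists when
 * neighbouring lists turn out to be empty (mirrors the batch_free idea). */
static void drain(int list_len[NR_LISTS], int count)
{
	int type = 0;

	while (count) {
		int batch_free = 0;

		/* Skip empty lists, remembering how many were skipped. */
		do {
			batch_free++;
			if (++type == NR_LISTS)
				type = 0;
		} while (list_len[type] == 0);

		/* Free up to batch_free items from the list we landed on. */
		do {
			list_len[type]--;
			printf("freed one page from list %d\n", type);
		} while (--count && --batch_free && list_len[type]);
	}
}

int main(void)
{
	int lists[NR_LISTS] = { 0, 5, 2 };

	drain(lists, 4);
	return 0;
}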
@@ -544,27 +635,31 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 		int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
+	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
-	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	__free_one_page(page, zone, order, migratetype);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
 	spin_unlock(&zone->lock);
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = TestClearPageMlocked(page);
 
+	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0 ; i < (1 << order) ; ++i)
-		bad += free_pages_check(page + i);
+	for (i = 0; i < (1 << order); i++) {
+		struct page *pg = page + i;
+
+		if (PageAnon(pg))
+			pg->mapping = NULL;
+		bad += free_pages_check(pg);
+	}
 	if (bad)
-		return;
+		return false;
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -574,6 +669,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int wasMlocked = __TestClearPageMlocked(page);
+
+	if (!free_pages_prepare(page, order))
+		return;
+
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -646,7 +752,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL) |
@@ -655,6 +761,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 		bad_page(page);
 		return 1;
 	}
+	return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	for (i = 0; i < (1 << order); i++) {
+		struct page *p = page + i;
+		if (unlikely(check_new_page(p)))
+			return 1;
+	}
 
 	set_page_private(page, 0);
 	set_page_refcounted(page);
@@ -783,6 +901,17 @@ static int move_freepages_block(struct zone *zone, struct page *page,
 	return move_freepages(zone, start_page, end_page, migratetype);
 }
 
+static void change_pageblock_range(struct page *pageblock_page,
+					int start_order, int migratetype)
+{
+	int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+	while (nr_pageblocks--) {
+		set_pageblock_migratetype(pageblock_page, migratetype);
+		pageblock_page += pageblock_nr_pages;
+	}
+}
+
 /* Remove an element from the buddy allocator from the fallback list */
 static inline struct page *
 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
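
change_pageblock_range() above simply walks 1 << (start_order - pageblock_order) pageblocks. A tiny sketch of that arithmetic, using an illustrative pageblock order of 9 (the real value is per-architecture):

#include <assert.h>

#define PAGEBLOCK_ORDER 9	/* illustrative, not the kernel's definition */

static int pageblocks_in_order(int start_order)
{
	return 1 << (start_order - PAGEBLOCK_ORDER);
}

int main(void)
{
	assert(pageblocks_in_order(9) == 1);	/* exactly one pageblock  */
	assert(pageblocks_in_order(10) == 2);	/* two pageblocks retyped */
	assert(pageblocks_in_order(11) == 4);
	return 0;
}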
@@ -817,13 +946,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 		 * agressive about taking ownership of free pages
 		 */
 		if (unlikely(current_order >= (pageblock_order >> 1)) ||
-				start_migratetype == MIGRATE_RECLAIMABLE) {
+				start_migratetype == MIGRATE_RECLAIMABLE ||
+				page_group_by_mobility_disabled) {
 			unsigned long pages;
 			pages = move_freepages_block(zone, page,
 						start_migratetype);
 
 			/* Claim the whole block if over half of it is free */
-			if (pages >= (1 << (pageblock_order-1)))
+			if (pages >= (1 << (pageblock_order-1)) ||
+					page_group_by_mobility_disabled)
 				set_pageblock_migratetype(page,
 							start_migratetype);
 
@@ -834,11 +965,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 			list_del(&page->lru);
 			rmv_page_order(page);
 
-			if (current_order == pageblock_order)
-				set_pageblock_migratetype(page,
+			/* Take ownership for orders >= pageblock_order */
+			if (current_order >= pageblock_order)
+				change_pageblock_range(page, current_order,
 							start_migratetype);
 
 			expand(zone, page, order, current_order, area, migratetype);
+
+			trace_mm_page_alloc_extfrag(page, order, current_order,
+				start_migratetype, migratetype);
+
 			return page;
 		}
 	}
@@ -872,6 +1008,7 @@ retry_reserve:
 		}
 	}
 
+	trace_mm_page_alloc_zone_locked(page, order, migratetype);
 	return page;
 }
 
@@ -882,7 +1019,7 @@ retry_reserve:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, int cold)
 {
 	int i;
 
@@ -901,7 +1038,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		list_add(&page->lru, list);
+		if (likely(cold == 0))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		set_page_private(page, migratetype);
 		list = &page->lru;
 	}
@@ -929,7 +1069,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pages_bulk(zone, to_drain, &pcp->list, 0);
+	free_pcppages_bulk(zone, to_drain, pcp);
 	pcp->count -= to_drain;
 	local_irq_restore(flags);
 }
@@ -951,11 +1091,11 @@ static void drain_pages(unsigned int cpu)
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		pset = zone_pcp(zone, cpu);
+		local_irq_save(flags);
+		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		local_irq_save(flags);
-		free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->count, pcp);
 		pcp->count = 0;
 		local_irq_restore(flags);
 	}
@@ -1015,56 +1155,54 @@ void mark_free_pages(struct zone *zone)
 
 /*
  * Free a 0-order page
+ * cold == 1 ? free a cold page : free a hot page
  */
-static void free_hot_cold_page(struct page *page, int cold)
+void free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	unsigned long flags;
-	int wasMlocked = TestClearPageMlocked(page);
-
-	kmemcheck_free_shadow(page, 0);
+	int migratetype;
+	int wasMlocked = __TestClearPageMlocked(page);
 
-	if (PageAnon(page))
-		page->mapping = NULL;
-	if (free_pages_check(page))
+	if (!free_pages_prepare(page, 0))
 		return;
 
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-	}
-	arch_free_page(page, 0);
-	kernel_map_pages(page, 1, 0);
-
-	pcp = &zone_pcp(zone, get_cpu())->pcp;
-	set_page_private(page, get_pageblock_migratetype(page));
+	migratetype = get_pageblock_migratetype(page);
+	set_page_private(page, migratetype);
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
 	__count_vm_event(PGFREE);
 
+	/*
+	 * We only track unmovable, reclaimable and movable on pcp lists.
+	 * Free ISOLATE pages back to the allocator because they are being
+	 * offlined but treat RESERVE as movable pages so we can get those
+	 * areas back if necessary. Otherwise, we may have to free
+	 * excessively into the page allocator
+	 */
+	if (migratetype >= MIGRATE_PCPTYPES) {
+		if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+			free_one_page(zone, page, 0, migratetype);
+			goto out;
+		}
+		migratetype = MIGRATE_MOVABLE;
+	}
+
+	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (cold)
-		list_add_tail(&page->lru, &pcp->list);
+		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	else
-		list_add(&page->lru, &pcp->list);
+		list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
-		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		free_pcppages_bulk(zone, pcp->batch, pcp);
 		pcp->count -= pcp->batch;
 	}
-	local_irq_restore(flags);
-	put_cpu();
-}
 
-void free_hot_page(struct page *page)
-{
-	free_hot_cold_page(page, 0);
-}
-
-void free_cold_page(struct page *page)
-{
-	free_hot_cold_page(page, 1);
-}
+out:
+	local_irq_restore(flags);
 }
 
 /*
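
The new branch in free_hot_cold_page() above routes a freed order-0 page by the migratetype of its pageblock: only the first few types have PCP lists, ISOLATE bypasses them entirely, and RESERVE is folded into MOVABLE. A minimal decision-table sketch, with enum values that are illustrative stand-ins for the kernel's migratetype constants:

#include <stdio.h>

/* Illustrative ordering: the first NR_PCPTYPES entries have PCP lists. */
enum mtype { UNMOVABLE, RECLAIMABLE, MOVABLE, NR_PCPTYPES,
	     RESERVE = NR_PCPTYPES, ISOLATE };

/* Returns the PCP list index to use, or -1 to free straight to the buddy lists. */
static int pcp_list_for(enum mtype mt)
{
	if (mt >= NR_PCPTYPES) {
		if (mt == ISOLATE)
			return -1;	/* would take the free_one_page() path */
		mt = MOVABLE;		/* treat RESERVE as movable */
	}
	return mt;
}

int main(void)
{
	printf("MOVABLE -> list %d\n", pcp_list_for(MOVABLE));
	printf("RESERVE -> list %d\n", pcp_list_for(RESERVE));
	printf("ISOLATE -> list %d\n", pcp_list_for(ISOLATE));
	return 0;
}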
@@ -1096,6 +1234,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
@@ -1108,39 +1291,27 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 
 again:
-	cpu  = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
+		struct list_head *list;
 
-		pcp = &zone_pcp(zone, cpu)->pcp;
 		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			if (unlikely(!pcp->count))
+		pcp = &this_cpu_ptr(zone->pageset)->pcp;
+		list = &pcp->lists[migratetype];
+		if (list_empty(list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, list,
+					migratetype, cold);
+			if (unlikely(list_empty(list)))
 				goto failed;
 		}
 
-		/* Find a page of the appropriate migrate type */
-		if (cold) {
-			list_for_each_entry_reverse(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		} else {
-			list_for_each_entry(page, &pcp->list, lru)
-				if (page_private(page) == migratetype)
-					break;
-		}
-
-		/* Allocate more to the pcp list if necessary */
-		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list, migratetype);
-			page = list_entry(pcp->list.next, struct page, lru);
-		}
+		if (cold)
+			page = list_entry(list->prev, struct page, lru);
+		else
+			page = list_entry(list->next, struct page, lru);
 
 		list_del(&page->lru);
 		pcp->count--;
@@ -1153,23 +1324,22 @@ again:
 		 * properly detect and handle allocation failures.
 		 *
 		 * We most definitely don't want callers attempting to
-		 * allocate greater than single-page units with
+		 * allocate greater than order-1 page units with
 		 * __GFP_NOFAIL.
 		 */
-		WARN_ON_ONCE(order > 0);
+		WARN_ON_ONCE(order > 1);
 	}
 	spin_lock_irqsave(&zone->lock, flags);
 	page = __rmqueue(zone, order, migratetype);
-	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 	spin_unlock(&zone->lock);
 	if (!page)
 		goto failed;
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1178,7 +1348,6 @@ again:
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
@@ -1299,7 +1468,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -1576,7 +1745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
@@ -1593,18 +1762,87 @@
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/* The OOM killer does not needlessly kill tasks for lowmem */
+		if (high_zoneidx < ZONE_NORMAL)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
 	return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page;
+
+	if (!order || compaction_deferred(preferred_zone))
+		return NULL;
+
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+								nodemask);
+	if (*did_some_progress != COMPACT_SKIPPED) {
+
+		/* Page migration frees to the PCP lists but we want merging */
+		drain_pages(get_cpu());
+		put_cpu();
+
+		page = get_page_from_freelist(gfp_mask, nodemask,
+				order, zonelist, high_zoneidx,
+				alloc_flags, preferred_zone,
+				migratetype);
+		if (page) {
+			preferred_zone->compact_considered = 0;
+			preferred_zone->compact_defer_shift = 0;
+			count_vm_event(COMPACTSUCCESS);
+			return page;
+		}
+
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+	}
+
+	return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
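
compaction_deferred()/defer_compaction() used in the hunk above implement a per-zone exponential backoff: each failed compaction run roughly doubles how many later attempts are skipped before compaction is tried again, and a success resets the counters. A simplified standalone sketch of that backoff idea; the field names and limit here are illustrative, not the exact kernel definitions:

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEFER_SHIFT 6

struct zone_backoff {
	unsigned int considered;
	unsigned int defer_shift;
};

/* Called when a compaction run failed to produce a usable page. */
static void defer_compaction(struct zone_backoff *z)
{
	z->considered = 0;
	if (z->defer_shift < MAX_DEFER_SHIFT)
		z->defer_shift++;
}

/* Returns true while we are still inside the back-off window. */
static bool compaction_deferred(struct zone_backoff *z)
{
	unsigned int limit = 1u << z->defer_shift;

	if (++z->considered > limit)
		z->considered = limit;	/* clamp to avoid overflow */

	return z->considered < limit;
}

int main(void)
{
	struct zone_backoff z = { 0, 0 };
	int attempt;

	defer_compaction(&z);	/* pretend the first run failed */
	defer_compaction(&z);	/* ...and the second one too    */

	for (attempt = 1; attempt <= 6; attempt++)
		printf("attempt %d: %s\n", attempt,
		       compaction_deferred(&z) ? "skipped" : "compact");
	return 0;
}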
@@ -1615,15 +1853,12 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
+	bool drained = false;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-
-	/*
-	 * The task's cpuset might have expanded its set of allowable nodes
-	 */
 	p->flags |= PF_MEMALLOC;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
@@ -1637,14 +1872,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	cond_resched();
 
-	if (order != 0)
-		drain_all_pages();
+	if (unlikely(!(*did_some_progress)))
+		return NULL;
 
-	if (likely(*did_some_progress))
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
 					alloc_flags, preferred_zone,
 					migratetype);
+
+	/*
+	 * If an allocation failed after direct reclaim, it could be because
+	 * pages are pinned on the per-cpu lists. Drain them and try again
+	 */
+	if (!page && !drained) {
+		drain_all_pages();
+		drained = true;
+		goto retry;
+	}
+
 	return page;
 }
 
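
The retry added above is a general "try, flush the per-CPU caches, try exactly once more" pattern; the drain is deliberately deferred until an allocation has actually failed, since draining every CPU is too costly to do on every reclaim pass. A generic sketch of the pattern with placeholder stubs, not kernel functions:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Placeholder allocator that only succeeds after the cache is flushed. */
static bool cache_flushed;

static void *try_allocate(void)
{
	return cache_flushed ? malloc(64) : NULL;
}

static void flush_caches(void)
{
	cache_flushed = true;	/* stands in for draining per-CPU free lists */
}

static void *allocate_with_one_drain(void)
{
	bool drained = false;
	void *obj;

retry:
	obj = try_allocate();

	/*
	 * Failure may only mean the free objects are parked in per-CPU
	 * caches; flush them once, retry, then give up for real.
	 */
	if (!obj && !drained) {
		flush_caches();
		drained = true;
		goto retry;
	}
	return obj;
}

int main(void)
{
	void *p = allocate_with_one_drain();

	printf("allocation %s\n", p ? "succeeded after drain" : "failed");
	free(p);
	return 0;
}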
@@ -1666,7 +1912,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
1666 | preferred_zone, migratetype); | 1912 | preferred_zone, migratetype); |
1667 | 1913 | ||
1668 | if (!page && gfp_mask & __GFP_NOFAIL) | 1914 | if (!page && gfp_mask & __GFP_NOFAIL) |
1669 | congestion_wait(WRITE, HZ/50); | 1915 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
1670 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 1916 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
1671 | 1917 | ||
1672 | return page; | 1918 | return page; |
@@ -1691,7 +1937,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1691 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1937 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1692 | 1938 | ||
1693 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ | 1939 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
1694 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); | 1940 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
1695 | 1941 | ||
1696 | /* | 1942 | /* |
1697 | * The caller may dip into page reserves a bit more if the caller | 1943 | * The caller may dip into page reserves a bit more if the caller |
@@ -1699,7 +1945,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1699 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 1945 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
1700 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 1946 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
1701 | */ | 1947 | */ |
1702 | alloc_flags |= (gfp_mask & __GFP_HIGH); | 1948 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
1703 | 1949 | ||
1704 | if (!wait) { | 1950 | if (!wait) { |
1705 | alloc_flags |= ALLOC_HARDER; | 1951 | alloc_flags |= ALLOC_HARDER; |
@@ -1708,7 +1954,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
1708 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1954 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1709 | */ | 1955 | */ |
1710 | alloc_flags &= ~ALLOC_CPUSET; | 1956 | alloc_flags &= ~ALLOC_CPUSET; |
1711 | } else if (unlikely(rt_task(p))) | 1957 | } else if (unlikely(rt_task(p)) && !in_interrupt()) |
1712 | alloc_flags |= ALLOC_HARDER; | 1958 | alloc_flags |= ALLOC_HARDER; |
1713 | 1959 | ||
1714 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 1960 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
@@ -1740,8 +1986,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1740 | * be using allocators in order of preference for an area that is | 1986 | * be using allocators in order of preference for an area that is |
1741 | * too large. | 1987 | * too large. |
1742 | */ | 1988 | */ |
1743 | if (WARN_ON_ONCE(order >= MAX_ORDER)) | 1989 | if (order >= MAX_ORDER) { |
1990 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); | ||
1744 | return NULL; | 1991 | return NULL; |
1992 | } | ||
1745 | 1993 | ||
1746 | /* | 1994 | /* |
1747 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1995 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1754,6 +2002,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1754 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2002 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1755 | goto nopage; | 2003 | goto nopage; |
1756 | 2004 | ||
2005 | restart: | ||
1757 | wake_all_kswapd(order, zonelist, high_zoneidx); | 2006 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1758 | 2007 | ||
1759 | /* | 2008 | /* |
@@ -1763,7 +2012,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
1763 | */ | 2012 | */ |
1764 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 2013 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
1765 | 2014 | ||
1766 | restart: | ||
1767 | /* This is the last chance, in general, before the goto nopage. */ | 2015 | /* This is the last chance, in general, before the goto nopage. */ |
1768 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2016 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
1769 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2017 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -1789,6 +2037,19 @@ rebalance: | |||
1789 | if (p->flags & PF_MEMALLOC) | 2037 | if (p->flags & PF_MEMALLOC) |
1790 | goto nopage; | 2038 | goto nopage; |
1791 | 2039 | ||
2040 | /* Avoid allocations with no watermarks from looping endlessly */ | ||
2041 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | ||
2042 | goto nopage; | ||
2043 | |||
2044 | /* Try direct compaction */ | ||
2045 | page = __alloc_pages_direct_compact(gfp_mask, order, | ||
2046 | zonelist, high_zoneidx, | ||
2047 | nodemask, | ||
2048 | alloc_flags, preferred_zone, | ||
2049 | migratetype, &did_some_progress); | ||
2050 | if (page) | ||
2051 | goto got_pg; | ||
2052 | |||
1792 | /* Try direct reclaim and then allocating */ | 2053 | /* Try direct reclaim and then allocating */ |
1793 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2054 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
1794 | zonelist, high_zoneidx, | 2055 | zonelist, high_zoneidx, |
@@ -1813,15 +2074,23 @@ rebalance: | |||
1813 | if (page) | 2074 | if (page) |
1814 | goto got_pg; | 2075 | goto got_pg; |
1815 | 2076 | ||
1816 | /* | 2077 | if (!(gfp_mask & __GFP_NOFAIL)) { |
1817 | * The OOM killer does not trigger for high-order | 2078 | /* |
1818 | * ~__GFP_NOFAIL allocations so if no progress is being | 2079 | * The oom killer is not called for high-order |
1819 | * made, there are no other options and retrying is | 2080 | * allocations that may fail, so if no progress |
1820 | * unlikely to help. | 2081 | * is being made, there are no other options and |
1821 | */ | 2082 | * retrying is unlikely to help. |
1822 | if (order > PAGE_ALLOC_COSTLY_ORDER && | 2083 | */ |
1823 | !(gfp_mask & __GFP_NOFAIL)) | 2084 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
1824 | goto nopage; | 2085 | goto nopage; |
2086 | /* | ||
2087 | * The oom killer is not called for lowmem | ||
2088 | * allocations to prevent needlessly killing | ||
2089 | * innocent tasks. | ||
2090 | */ | ||
2091 | if (high_zoneidx < ZONE_NORMAL) | ||
2092 | goto nopage; | ||
2093 | } | ||
1825 | 2094 | ||
1826 | goto restart; | 2095 | goto restart; |
1827 | } | 2096 | } |
@@ -1831,7 +2100,7 @@ rebalance: | |||
1831 | pages_reclaimed += did_some_progress; | 2100 | pages_reclaimed += did_some_progress; |
1832 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { | 2101 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1833 | /* Wait for some write requests to complete then retry */ | 2102 | /* Wait for some write requests to complete then retry */ |
1834 | congestion_wait(WRITE, HZ/50); | 2103 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
1835 | goto rebalance; | 2104 | goto rebalance; |
1836 | } | 2105 | } |
1837 | 2106 | ||
@@ -1880,10 +2149,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
1880 | if (unlikely(!zonelist->_zonerefs->zone)) | 2149 | if (unlikely(!zonelist->_zonerefs->zone)) |
1881 | return NULL; | 2150 | return NULL; |
1882 | 2151 | ||
2152 | get_mems_allowed(); | ||
1883 | /* The preferred zone is used for statistics later */ | 2153 | /* The preferred zone is used for statistics later */ |
1884 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | 2154 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); |
1885 | if (!preferred_zone) | 2155 | if (!preferred_zone) { |
2156 | put_mems_allowed(); | ||
1886 | return NULL; | 2157 | return NULL; |
2158 | } | ||
1887 | 2159 | ||
1888 | /* First allocation attempt */ | 2160 | /* First allocation attempt */ |
1889 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2161 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -1893,7 +2165,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
1893 | page = __alloc_pages_slowpath(gfp_mask, order, | 2165 | page = __alloc_pages_slowpath(gfp_mask, order, |
1894 | zonelist, high_zoneidx, nodemask, | 2166 | zonelist, high_zoneidx, nodemask, |
1895 | preferred_zone, migratetype); | 2167 | preferred_zone, migratetype); |
2168 | put_mems_allowed(); | ||
1896 | 2169 | ||
2170 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | ||
1897 | return page; | 2171 | return page; |
1898 | } | 2172 | } |
1899 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2173 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -1903,46 +2177,42 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); | |||
1903 | */ | 2177 | */ |
1904 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 2178 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
1905 | { | 2179 | { |
1906 | struct page * page; | 2180 | struct page *page; |
2181 | |||
2182 | /* | ||
2183 | * __get_free_pages() returns a 32-bit address, which cannot represent | ||
2184 | * a highmem page | ||
2185 | */ | ||
2186 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | ||
2187 | |||
1907 | page = alloc_pages(gfp_mask, order); | 2188 | page = alloc_pages(gfp_mask, order); |
1908 | if (!page) | 2189 | if (!page) |
1909 | return 0; | 2190 | return 0; |
1910 | return (unsigned long) page_address(page); | 2191 | return (unsigned long) page_address(page); |
1911 | } | 2192 | } |
1912 | |||
1913 | EXPORT_SYMBOL(__get_free_pages); | 2193 | EXPORT_SYMBOL(__get_free_pages); |
1914 | 2194 | ||
1915 | unsigned long get_zeroed_page(gfp_t gfp_mask) | 2195 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
1916 | { | 2196 | { |
1917 | struct page * page; | 2197 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); |
1918 | |||
1919 | /* | ||
1920 | * get_zeroed_page() returns a 32-bit address, which cannot represent | ||
1921 | * a highmem page | ||
1922 | */ | ||
1923 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | ||
1924 | |||
1925 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | ||
1926 | if (page) | ||
1927 | return (unsigned long) page_address(page); | ||
1928 | return 0; | ||
1929 | } | 2198 | } |
1930 | |||
1931 | EXPORT_SYMBOL(get_zeroed_page); | 2199 | EXPORT_SYMBOL(get_zeroed_page); |
1932 | 2200 | ||
1933 | void __pagevec_free(struct pagevec *pvec) | 2201 | void __pagevec_free(struct pagevec *pvec) |
1934 | { | 2202 | { |
1935 | int i = pagevec_count(pvec); | 2203 | int i = pagevec_count(pvec); |
1936 | 2204 | ||
1937 | while (--i >= 0) | 2205 | while (--i >= 0) { |
2206 | trace_mm_pagevec_free(pvec->pages[i], pvec->cold); | ||
1938 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 2207 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
2208 | } | ||
1939 | } | 2209 | } |
1940 | 2210 | ||
1941 | void __free_pages(struct page *page, unsigned int order) | 2211 | void __free_pages(struct page *page, unsigned int order) |
1942 | { | 2212 | { |
1943 | if (put_page_testzero(page)) { | 2213 | if (put_page_testzero(page)) { |
1944 | if (order == 0) | 2214 | if (order == 0) |
1945 | free_hot_page(page); | 2215 | free_hot_cold_page(page, 0); |
1946 | else | 2216 | else |
1947 | __free_pages_ok(page, order); | 2217 | __free_pages_ok(page, order); |
1948 | } | 2218 | } |
@@ -1983,7 +2253,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | |||
1983 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | 2253 | unsigned long alloc_end = addr + (PAGE_SIZE << order); |
1984 | unsigned long used = addr + PAGE_ALIGN(size); | 2254 | unsigned long used = addr + PAGE_ALIGN(size); |
1985 | 2255 | ||
1986 | split_page(virt_to_page(addr), order); | 2256 | split_page(virt_to_page((void *)addr), order); |
1987 | while (used < alloc_end) { | 2257 | while (used < alloc_end) { |
1988 | free_page(used); | 2258 | free_page(used); |
1989 | used += PAGE_SIZE; | 2259 | used += PAGE_SIZE; |
@@ -2107,7 +2377,7 @@ void show_free_areas(void) | |||
2107 | for_each_online_cpu(cpu) { | 2377 | for_each_online_cpu(cpu) { |
2108 | struct per_cpu_pageset *pageset; | 2378 | struct per_cpu_pageset *pageset; |
2109 | 2379 | ||
2110 | pageset = zone_pcp(zone, cpu); | 2380 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2111 | 2381 | ||
2112 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2382 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2113 | cpu, pageset->pcp.high, | 2383 | cpu, pageset->pcp.high, |
@@ -2115,23 +2385,27 @@ void show_free_areas(void) | |||
2115 | } | 2385 | } |
2116 | } | 2386 | } |
2117 | 2387 | ||
2118 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2388 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
2119 | " inactive_file:%lu" | 2389 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
2120 | " unevictable:%lu" | 2390 | " unevictable:%lu" |
2121 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2391 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2122 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2392 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2393 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | ||
2123 | global_page_state(NR_ACTIVE_ANON), | 2394 | global_page_state(NR_ACTIVE_ANON), |
2124 | global_page_state(NR_ACTIVE_FILE), | ||
2125 | global_page_state(NR_INACTIVE_ANON), | 2395 | global_page_state(NR_INACTIVE_ANON), |
2396 | global_page_state(NR_ISOLATED_ANON), | ||
2397 | global_page_state(NR_ACTIVE_FILE), | ||
2126 | global_page_state(NR_INACTIVE_FILE), | 2398 | global_page_state(NR_INACTIVE_FILE), |
2399 | global_page_state(NR_ISOLATED_FILE), | ||
2127 | global_page_state(NR_UNEVICTABLE), | 2400 | global_page_state(NR_UNEVICTABLE), |
2128 | global_page_state(NR_FILE_DIRTY), | 2401 | global_page_state(NR_FILE_DIRTY), |
2129 | global_page_state(NR_WRITEBACK), | 2402 | global_page_state(NR_WRITEBACK), |
2130 | global_page_state(NR_UNSTABLE_NFS), | 2403 | global_page_state(NR_UNSTABLE_NFS), |
2131 | global_page_state(NR_FREE_PAGES), | 2404 | global_page_state(NR_FREE_PAGES), |
2132 | global_page_state(NR_SLAB_RECLAIMABLE) + | 2405 | global_page_state(NR_SLAB_RECLAIMABLE), |
2133 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 2406 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
2134 | global_page_state(NR_FILE_MAPPED), | 2407 | global_page_state(NR_FILE_MAPPED), |
2408 | global_page_state(NR_SHMEM), | ||
2135 | global_page_state(NR_PAGETABLE), | 2409 | global_page_state(NR_PAGETABLE), |
2136 | global_page_state(NR_BOUNCE)); | 2410 | global_page_state(NR_BOUNCE)); |
2137 | 2411 | ||
@@ -2149,12 +2423,26 @@ void show_free_areas(void) | |||
2149 | " active_file:%lukB" | 2423 | " active_file:%lukB" |
2150 | " inactive_file:%lukB" | 2424 | " inactive_file:%lukB" |
2151 | " unevictable:%lukB" | 2425 | " unevictable:%lukB" |
2426 | " isolated(anon):%lukB" | ||
2427 | " isolated(file):%lukB" | ||
2152 | " present:%lukB" | 2428 | " present:%lukB" |
2429 | " mlocked:%lukB" | ||
2430 | " dirty:%lukB" | ||
2431 | " writeback:%lukB" | ||
2432 | " mapped:%lukB" | ||
2433 | " shmem:%lukB" | ||
2434 | " slab_reclaimable:%lukB" | ||
2435 | " slab_unreclaimable:%lukB" | ||
2436 | " kernel_stack:%lukB" | ||
2437 | " pagetables:%lukB" | ||
2438 | " unstable:%lukB" | ||
2439 | " bounce:%lukB" | ||
2440 | " writeback_tmp:%lukB" | ||
2153 | " pages_scanned:%lu" | 2441 | " pages_scanned:%lu" |
2154 | " all_unreclaimable? %s" | 2442 | " all_unreclaimable? %s" |
2155 | "\n", | 2443 | "\n", |
2156 | zone->name, | 2444 | zone->name, |
2157 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2445 | K(zone_nr_free_pages(zone)), |
2158 | K(min_wmark_pages(zone)), | 2446 | K(min_wmark_pages(zone)), |
2159 | K(low_wmark_pages(zone)), | 2447 | K(low_wmark_pages(zone)), |
2160 | K(high_wmark_pages(zone)), | 2448 | K(high_wmark_pages(zone)), |
@@ -2163,9 +2451,24 @@ void show_free_areas(void) | |||
2163 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2451 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
2164 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2452 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
2165 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2453 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
2454 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | ||
2455 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | ||
2166 | K(zone->present_pages), | 2456 | K(zone->present_pages), |
2457 | K(zone_page_state(zone, NR_MLOCK)), | ||
2458 | K(zone_page_state(zone, NR_FILE_DIRTY)), | ||
2459 | K(zone_page_state(zone, NR_WRITEBACK)), | ||
2460 | K(zone_page_state(zone, NR_FILE_MAPPED)), | ||
2461 | K(zone_page_state(zone, NR_SHMEM)), | ||
2462 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | ||
2463 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | ||
2464 | zone_page_state(zone, NR_KERNEL_STACK) * | ||
2465 | THREAD_SIZE / 1024, | ||
2466 | K(zone_page_state(zone, NR_PAGETABLE)), | ||
2467 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | ||
2468 | K(zone_page_state(zone, NR_BOUNCE)), | ||
2469 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | ||
2167 | zone->pages_scanned, | 2470 | zone->pages_scanned, |
2168 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2471 | (zone->all_unreclaimable ? "yes" : "no") |
2169 | ); | 2472 | ); |
2170 | printk("lowmem_reserve[]:"); | 2473 | printk("lowmem_reserve[]:"); |
2171 | for (i = 0; i < MAX_NR_ZONES; i++) | 2474 | for (i = 0; i < MAX_NR_ZONES; i++) |
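The per-zone dump above reports most counters in kilobytes via the K() macro, while the kernel-stack line is special because NR_KERNEL_STACK counts stacks rather than pages. A minimal userspace sketch of that arithmetic, assuming 4 KiB pages and 8 KiB kernel stacks (the real values are architecture dependent, and the counter values are invented):

/* Standalone sketch, not kernel code: how the zone dump scales counters. */
#include <stdio.h>

#define PAGE_SHIFT   12                          /* assumed: 4 KiB pages */
#define THREAD_SIZE  8192                        /* assumed: 8 KiB kernel stacks */
#define K(x)         ((x) << (PAGE_SHIFT - 10))  /* pages -> kB */

int main(void)
{
        unsigned long nr_free = 12345;           /* e.g. NR_FREE_PAGES, made up */
        unsigned long nr_kernel_stack = 200;     /* NR_KERNEL_STACK counts stacks */

        printf("free:%lukB\n", K(nr_free));
        printf("kernel_stack:%lukB\n", nr_kernel_stack * THREAD_SIZE / 1024);
        return 0;
}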
@@ -2292,18 +2595,19 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); | |||
2292 | * sysctl handler for numa_zonelist_order | 2595 | * sysctl handler for numa_zonelist_order |
2293 | */ | 2596 | */ |
2294 | int numa_zonelist_order_handler(ctl_table *table, int write, | 2597 | int numa_zonelist_order_handler(ctl_table *table, int write, |
2295 | struct file *file, void __user *buffer, size_t *length, | 2598 | void __user *buffer, size_t *length, |
2296 | loff_t *ppos) | 2599 | loff_t *ppos) |
2297 | { | 2600 | { |
2298 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2601 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
2299 | int ret; | 2602 | int ret; |
2603 | static DEFINE_MUTEX(zl_order_mutex); | ||
2300 | 2604 | ||
2605 | mutex_lock(&zl_order_mutex); | ||
2301 | if (write) | 2606 | if (write) |
2302 | strncpy(saved_string, (char*)table->data, | 2607 | strcpy(saved_string, (char*)table->data); |
2303 | NUMA_ZONELIST_ORDER_LEN); | 2608 | ret = proc_dostring(table, write, buffer, length, ppos); |
2304 | ret = proc_dostring(table, write, file, buffer, length, ppos); | ||
2305 | if (ret) | 2609 | if (ret) |
2306 | return ret; | 2610 | goto out; |
2307 | if (write) { | 2611 | if (write) { |
2308 | int oldval = user_zonelist_order; | 2612 | int oldval = user_zonelist_order; |
2309 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2613 | if (__parse_numa_zonelist_order((char*)table->data)) { |
@@ -2313,10 +2617,15 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
2313 | strncpy((char*)table->data, saved_string, | 2617 | strncpy((char*)table->data, saved_string, |
2314 | NUMA_ZONELIST_ORDER_LEN); | 2618 | NUMA_ZONELIST_ORDER_LEN); |
2315 | user_zonelist_order = oldval; | 2619 | user_zonelist_order = oldval; |
2316 | } else if (oldval != user_zonelist_order) | 2620 | } else if (oldval != user_zonelist_order) { |
2317 | build_all_zonelists(); | 2621 | mutex_lock(&zonelists_mutex); |
2622 | build_all_zonelists(NULL); | ||
2623 | mutex_unlock(&zonelists_mutex); | ||
2624 | } | ||
2318 | } | 2625 | } |
2319 | return 0; | 2626 | out: |
2627 | mutex_unlock(&zl_order_mutex); | ||
2628 | return ret; | ||
2320 | } | 2629 | } |
2321 | 2630 | ||
2322 | 2631 | ||
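The two hunks above change numa_zonelist_order_handler() to serialize writers with a local mutex, save the current string before proc_dostring() overwrites it, and restore it if the new value fails to parse. Below is a hedged userspace sketch of that save/parse/rollback pattern; set_order(), parse_order() and the fixed-size buffer are invented stand-ins, not kernel interfaces:

/* Userspace sketch of the save/parse/rollback pattern (not kernel code). */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define ORDER_LEN 16

static char current_order[ORDER_LEN] = "default";
static pthread_mutex_t order_lock = PTHREAD_MUTEX_INITIALIZER;

static int parse_order(const char *s)            /* nonzero means invalid */
{
        return strcmp(s, "node") && strcmp(s, "zone") && strcmp(s, "default");
}

static int set_order(const char *new_value)
{
        char saved[ORDER_LEN];
        int ret = 0;

        pthread_mutex_lock(&order_lock);
        strcpy(saved, current_order);            /* remember the old value */
        snprintf(current_order, ORDER_LEN, "%s", new_value);
        if (parse_order(current_order)) {        /* invalid: roll back */
                strncpy(current_order, saved, ORDER_LEN);
                ret = -1;
        }
        /* else: the real handler rebuilds the zonelists here */
        pthread_mutex_unlock(&order_lock);
        return ret;
}

int main(void)
{
        printf("set 'node'  -> %d, order now '%s'\n", set_order("node"), current_order);
        printf("set 'bogus' -> %d, order now '%s'\n", set_order("bogus"), current_order);
        return 0;
}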
@@ -2456,10 +2765,10 @@ static int default_zonelist_order(void) | |||
2456 | struct zone *z; | 2765 | struct zone *z; |
2457 | int average_size; | 2766 | int average_size; |
2458 | /* | 2767 | /* |
2459 | * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. | 2768 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. |
2460 | * If they are really small and used heavily, the system can fall | 2769 | * If they are really small and used heavily, the system can fall |
2461 | * into OOM very easily. | 2770 | * into OOM very easily. |
2462 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | 2771 | * This function detect ZONE_DMA/DMA32 size and configures zone order. |
2463 | */ | 2772 | */ |
2464 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | 2773 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ |
2465 | low_kmem_size = 0; | 2774 | low_kmem_size = 0; |
@@ -2471,6 +2780,15 @@ static int default_zonelist_order(void) | |||
2471 | if (zone_type < ZONE_NORMAL) | 2780 | if (zone_type < ZONE_NORMAL) |
2472 | low_kmem_size += z->present_pages; | 2781 | low_kmem_size += z->present_pages; |
2473 | total_size += z->present_pages; | 2782 | total_size += z->present_pages; |
2783 | } else if (zone_type == ZONE_NORMAL) { | ||
2784 | /* | ||
2785 | * If any node has only lowmem, then node order | ||
2786 | * is preferred to allow kernel allocations | ||
2787 | * locally; otherwise, they can easily infringe | ||
2788 | * on other nodes when there is an abundance of | ||
2789 | * lowmem available to allocate from. | ||
2790 | */ | ||
2791 | return ZONELIST_ORDER_NODE; | ||
2474 | } | 2792 | } |
2475 | } | 2793 | } |
2476 | } | 2794 | } |
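The branch added above makes default_zonelist_order() fall back to node ordering as soon as it finds a node whose ZONE_NORMAL is unpopulated, i.e. a node with only lowmem. A rough sketch of that decision, with an invented stand-in for the fuller ratio heuristics the rest of the function applies:

/* Sketch only; the second check is a made-up stand-in, not the kernel's. */
#include <stdio.h>

enum { ORDER_ZONE, ORDER_NODE };

static int pick_order(unsigned long low_kmem_pages, unsigned long total_pages,
                      int some_node_lacks_normal)
{
        if (some_node_lacks_normal)
                return ORDER_NODE;      /* the case added by this hunk */
        if (low_kmem_pages && low_kmem_pages * 10 > total_pages * 7)
                return ORDER_NODE;      /* rough stand-in for the ratio heuristics */
        return ORDER_ZONE;
}

int main(void)
{
        printf("%s\n", pick_order(1 << 14, 1 << 18, 1) == ORDER_NODE ? "node" : "zone");
        printf("%s\n", pick_order(1 << 14, 1 << 18, 0) == ORDER_NODE ? "node" : "zone");
        return 0;
}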
@@ -2533,7 +2851,6 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2533 | prev_node = local_node; | 2851 | prev_node = local_node; |
2534 | nodes_clear(used_mask); | 2852 | nodes_clear(used_mask); |
2535 | 2853 | ||
2536 | memset(node_load, 0, sizeof(node_load)); | ||
2537 | memset(node_order, 0, sizeof(node_order)); | 2854 | memset(node_order, 0, sizeof(node_order)); |
2538 | j = 0; | 2855 | j = 0; |
2539 | 2856 | ||
@@ -2585,6 +2902,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2585 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 2902 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
2586 | } | 2903 | } |
2587 | 2904 | ||
2905 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
2906 | /* | ||
2907 | * Return node id of node used for "local" allocations. | ||
2908 | * I.e., first node id of first zone in arg node's generic zonelist. | ||
2909 | * Used for initializing percpu 'numa_mem', which is used primarily | ||
2910 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. | ||
2911 | */ | ||
2912 | int local_memory_node(int node) | ||
2913 | { | ||
2914 | struct zone *zone; | ||
2915 | |||
2916 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), | ||
2917 | gfp_zone(GFP_KERNEL), | ||
2918 | NULL, | ||
2919 | &zone); | ||
2920 | return zone->node; | ||
2921 | } | ||
2922 | #endif | ||
2588 | 2923 | ||
2589 | #else /* CONFIG_NUMA */ | 2924 | #else /* CONFIG_NUMA */ |
2590 | 2925 | ||
@@ -2637,21 +2972,85 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2637 | 2972 | ||
2638 | #endif /* CONFIG_NUMA */ | 2973 | #endif /* CONFIG_NUMA */ |
2639 | 2974 | ||
2975 | /* | ||
2976 | * Boot pageset table. One per cpu which is going to be used for all | ||
2977 | * zones and all nodes. The parameters will be set in such a way | ||
2978 | * that an item put on a list will immediately be handed over to | ||
2979 | * the buddy list. This is safe since pageset manipulation is done | ||
2980 | * with interrupts disabled. | ||
2981 | * | ||
2982 | * The boot_pagesets must be kept even after bootup is complete for | ||
2983 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2984 | * hotplugged processors. | ||
2985 | * | ||
2986 | * zoneinfo_show() and maybe other functions do | ||
2987 | * not check if the processor is online before following the pageset pointer. | ||
2988 | * Other parts of the kernel may not check if the zone is available. | ||
2989 | */ | ||
2990 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
2991 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
2992 | static void setup_zone_pageset(struct zone *zone); | ||
2993 | |||
2994 | /* | ||
2995 | * Global mutex to protect against size modification of zonelists | ||
2996 | * as well as to serialize pageset setup for the new populated zone. | ||
2997 | */ | ||
2998 | DEFINE_MUTEX(zonelists_mutex); | ||
2999 | |||
2640 | /* return values int ....just for stop_machine() */ | 3000 | /* return values int ....just for stop_machine() */ |
2641 | static int __build_all_zonelists(void *dummy) | 3001 | static __init_refok int __build_all_zonelists(void *data) |
2642 | { | 3002 | { |
2643 | int nid; | 3003 | int nid; |
3004 | int cpu; | ||
2644 | 3005 | ||
3006 | #ifdef CONFIG_NUMA | ||
3007 | memset(node_load, 0, sizeof(node_load)); | ||
3008 | #endif | ||
2645 | for_each_online_node(nid) { | 3009 | for_each_online_node(nid) { |
2646 | pg_data_t *pgdat = NODE_DATA(nid); | 3010 | pg_data_t *pgdat = NODE_DATA(nid); |
2647 | 3011 | ||
2648 | build_zonelists(pgdat); | 3012 | build_zonelists(pgdat); |
2649 | build_zonelist_cache(pgdat); | 3013 | build_zonelist_cache(pgdat); |
2650 | } | 3014 | } |
3015 | |||
3016 | /* | ||
3017 | * Initialize the boot_pagesets that are going to be used | ||
3018 | * for bootstrapping processors. The real pagesets for | ||
3019 | * each zone will be allocated later when the per cpu | ||
3020 | * allocator is available. | ||
3021 | * | ||
3022 | * boot_pagesets are used also for bootstrapping offline | ||
3023 | * cpus if the system is already booted because the pagesets | ||
3024 | * are needed to initialize allocators on a specific cpu too. | ||
3025 | * F.e. the percpu allocator needs the page allocator which | ||
3026 | * needs the percpu allocator in order to allocate its pagesets | ||
3027 | * (a chicken-egg dilemma). | ||
3028 | */ | ||
3029 | for_each_possible_cpu(cpu) { | ||
3030 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
3031 | |||
3032 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
3033 | /* | ||
3034 | * We now know the "local memory node" for each node-- | ||
3035 | * i.e., the node of the first zone in the generic zonelist. | ||
3036 | * Set up numa_mem percpu variable for on-line cpus. During | ||
3037 | * boot, only the boot cpu should be on-line; we'll init the | ||
3038 | * secondary cpus' numa_mem as they come on-line. During | ||
3039 | * node/memory hotplug, we'll fixup all on-line cpus. | ||
3040 | */ | ||
3041 | if (cpu_online(cpu)) | ||
3042 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | ||
3043 | #endif | ||
3044 | } | ||
3045 | |||
2651 | return 0; | 3046 | return 0; |
2652 | } | 3047 | } |
2653 | 3048 | ||
2654 | void build_all_zonelists(void) | 3049 | /* |
3050 | * Called with zonelists_mutex held always | ||
3051 | * unless system_state == SYSTEM_BOOTING. | ||
3052 | */ | ||
3053 | void build_all_zonelists(void *data) | ||
2655 | { | 3054 | { |
2656 | set_zonelist_order(); | 3055 | set_zonelist_order(); |
2657 | 3056 | ||
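The new boot_pageset is a statically allocated per-cpu pageset that covers the window before the per-cpu allocator exists (and later bootstraps hotplugged CPUs); __build_all_zonelists() points every possible CPU at it with a batch of 0 so freed pages go straight back to the buddy lists. A simplified userspace sketch of that bootstrap-then-replace pattern, with invented types and sizes:

/* Sketch of the bootstrap pattern, not kernel code. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct pageset { int high, batch; };

static struct pageset boot_pageset[NR_CPUS];     /* always available, even pre-allocator */
static struct pageset *pageset[NR_CPUS];         /* what callers actually follow */

static void setup_pageset(struct pageset *p, int batch)
{
        p->batch = batch > 0 ? batch : 1;
        p->high = 6 * p->batch;
}

int main(void)
{
        int cpu;

        /* Early boot: every CPU uses its static boot pageset (batch 0). */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                setup_pageset(&boot_pageset[cpu], 0);
                pageset[cpu] = &boot_pageset[cpu];
        }

        /* Later, once an allocator exists, real pagesets replace them. */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                pageset[cpu] = malloc(sizeof(*pageset[cpu]));
                if (!pageset[cpu])
                        return 1;
                setup_pageset(pageset[cpu], 16);
        }

        printf("cpu0 high=%d batch=%d\n", pageset[0]->high, pageset[0]->batch);
        return 0;
}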
@@ -2662,6 +3061,10 @@ void build_all_zonelists(void) | |||
2662 | } else { | 3061 | } else { |
2663 | /* we have to stop all cpus to guarantee there is no user | 3062 | /* we have to stop all cpus to guarantee there is no user |
2664 | of zonelist */ | 3063 | of zonelist */ |
3064 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
3065 | if (data) | ||
3066 | setup_zone_pageset((struct zone *)data); | ||
3067 | #endif | ||
2665 | stop_machine(__build_all_zonelists, NULL, NULL); | 3068 | stop_machine(__build_all_zonelists, NULL, NULL); |
2666 | /* cpuset refresh routine should be here */ | 3069 | /* cpuset refresh routine should be here */ |
2667 | } | 3070 | } |
@@ -2768,7 +3171,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2768 | { | 3171 | { |
2769 | unsigned long start_pfn, pfn, end_pfn; | 3172 | unsigned long start_pfn, pfn, end_pfn; |
2770 | struct page *page; | 3173 | struct page *page; |
2771 | unsigned long reserve, block_migratetype; | 3174 | unsigned long block_migratetype; |
3175 | int reserve; | ||
2772 | 3176 | ||
2773 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 3177 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
2774 | start_pfn = zone->zone_start_pfn; | 3178 | start_pfn = zone->zone_start_pfn; |
@@ -2776,6 +3180,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
2776 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3180 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
2777 | pageblock_order; | 3181 | pageblock_order; |
2778 | 3182 | ||
3183 | /* | ||
3184 | * Reserve blocks are generally in place to help high-order atomic | ||
3185 | * allocations that are short-lived. A min_free_kbytes value that | ||
3186 | * would result in more than 2 reserve blocks for atomic allocations | ||
3187 | * is assumed to be in place to help anti-fragmentation for the | ||
3188 | * future allocation of hugepages at runtime. | ||
3189 | */ | ||
3190 | reserve = min(2, reserve); | ||
3191 | |||
2779 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3192 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
2780 | if (!pfn_valid(pfn)) | 3193 | if (!pfn_valid(pfn)) |
2781 | continue; | 3194 | continue; |
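The hunk above caps the number of MIGRATE_RESERVE pageblocks at two: enough for short-lived high-order atomic allocations, while larger min_free_kbytes settings are treated as anti-fragmentation headroom rather than extra reserve. A small sketch of the arithmetic with made-up watermark and pageblock values:

/* Sketch of the reserve calculation; numbers are illustrative. */
#include <stdio.h>

#define pageblock_order     9                        /* assumed: 2 MiB blocks on 4 KiB pages */
#define pageblock_nr_pages  (1UL << pageblock_order)

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
        return ((x + to - 1) / to) * to;
}

int main(void)
{
        unsigned long min_wmark = 5000;              /* pages, made up */
        int reserve;

        reserve = roundup_ul(min_wmark, pageblock_nr_pages) >> pageblock_order;
        if (reserve > 2)
                reserve = 2;                         /* min(2, reserve) from the hunk */
        printf("MIGRATE_RESERVE pageblocks: %d\n", reserve);
        return 0;
}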
@@ -2946,6 +3359,7 @@ static int zone_batchsize(struct zone *zone) | |||
2946 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 3359 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
2947 | { | 3360 | { |
2948 | struct per_cpu_pages *pcp; | 3361 | struct per_cpu_pages *pcp; |
3362 | int migratetype; | ||
2949 | 3363 | ||
2950 | memset(p, 0, sizeof(*p)); | 3364 | memset(p, 0, sizeof(*p)); |
2951 | 3365 | ||
@@ -2953,7 +3367,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
2953 | pcp->count = 0; | 3367 | pcp->count = 0; |
2954 | pcp->high = 6 * batch; | 3368 | pcp->high = 6 * batch; |
2955 | pcp->batch = max(1UL, 1 * batch); | 3369 | pcp->batch = max(1UL, 1 * batch); |
2956 | INIT_LIST_HEAD(&pcp->list); | 3370 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
3371 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | ||
2957 | } | 3372 | } |
2958 | 3373 | ||
2959 | /* | 3374 | /* |
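With the pcp free list split per migratetype, setup_pageset() now initialises MIGRATE_PCPTYPES list heads instead of one. A tiny sketch of the resulting structure, assuming three pcp migratetypes and a trivial list_head stand-in:

/* Sketch only; list_head and the counts are simplified stand-ins. */
#include <stdio.h>

#define MIGRATE_PCPTYPES 3                      /* assumed: unmovable, reclaimable, movable */

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *l) { l->next = l->prev = l; }

struct per_cpu_pages {
        int count, high, batch;
        struct list_head lists[MIGRATE_PCPTYPES];   /* one free list per migratetype */
};

int main(void)
{
        struct per_cpu_pages pcp = { .count = 0, .batch = 16, .high = 6 * 16 };
        int mt;

        for (mt = 0; mt < MIGRATE_PCPTYPES; mt++)
                INIT_LIST_HEAD(&pcp.lists[mt]);
        printf("high=%d batch=%d lists=%d\n", pcp.high, pcp.batch, MIGRATE_PCPTYPES);
        return 0;
}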
@@ -2973,121 +3388,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
2973 | pcp->batch = PAGE_SHIFT * 8; | 3388 | pcp->batch = PAGE_SHIFT * 8; |
2974 | } | 3389 | } |
2975 | 3390 | ||
2976 | 3391 | static __meminit void setup_zone_pageset(struct zone *zone) | |
2977 | #ifdef CONFIG_NUMA | ||
2978 | /* | ||
2979 | * Boot pageset table. One per cpu which is going to be used for all | ||
2980 | * zones and all nodes. The parameters will be set in such a way | ||
2981 | * that an item put on a list will immediately be handed over to | ||
2982 | * the buddy list. This is safe since pageset manipulation is done | ||
2983 | * with interrupts disabled. | ||
2984 | * | ||
2985 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
2986 | * | ||
2987 | * The boot_pagesets must be kept even after bootup is complete for | ||
2988 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2989 | * hotplugged processors. | ||
2990 | * | ||
2991 | * zoneinfo_show() and maybe other functions do | ||
2992 | * not check if the processor is online before following the pageset pointer. | ||
2993 | * Other parts of the kernel may not check if the zone is available. | ||
2994 | */ | ||
2995 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
2996 | |||
2997 | /* | ||
2998 | * Dynamically allocate memory for the | ||
2999 | * per cpu pageset array in struct zone. | ||
3000 | */ | ||
3001 | static int __cpuinit process_zones(int cpu) | ||
3002 | { | 3392 | { |
3003 | struct zone *zone, *dzone; | 3393 | int cpu; |
3004 | int node = cpu_to_node(cpu); | ||
3005 | 3394 | ||
3006 | node_set_state(node, N_CPU); /* this node has a cpu */ | 3395 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3007 | 3396 | ||
3008 | for_each_populated_zone(zone) { | 3397 | for_each_possible_cpu(cpu) { |
3009 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3398 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); |
3010 | GFP_KERNEL, node); | ||
3011 | if (!zone_pcp(zone, cpu)) | ||
3012 | goto bad; | ||
3013 | 3399 | ||
3014 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 3400 | setup_pageset(pcp, zone_batchsize(zone)); |
3015 | 3401 | ||
3016 | if (percpu_pagelist_fraction) | 3402 | if (percpu_pagelist_fraction) |
3017 | setup_pagelist_highmark(zone_pcp(zone, cpu), | 3403 | setup_pagelist_highmark(pcp, |
3018 | (zone->present_pages / percpu_pagelist_fraction)); | 3404 | (zone->present_pages / |
3019 | } | 3405 | percpu_pagelist_fraction)); |
3020 | |||
3021 | return 0; | ||
3022 | bad: | ||
3023 | for_each_zone(dzone) { | ||
3024 | if (!populated_zone(dzone)) | ||
3025 | continue; | ||
3026 | if (dzone == zone) | ||
3027 | break; | ||
3028 | kfree(zone_pcp(dzone, cpu)); | ||
3029 | zone_pcp(dzone, cpu) = NULL; | ||
3030 | } | 3406 | } |
3031 | return -ENOMEM; | ||
3032 | } | 3407 | } |
3033 | 3408 | ||
3034 | static inline void free_zone_pagesets(int cpu) | 3409 | /* |
3035 | { | 3410 | * Allocate per cpu pagesets and initialize them. |
3036 | struct zone *zone; | 3411 | * Before this call only boot pagesets were available. |
3037 | 3412 | */ | |
3038 | for_each_zone(zone) { | ||
3039 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
3040 | |||
3041 | /* Free per_cpu_pageset if it is slab allocated */ | ||
3042 | if (pset != &boot_pageset[cpu]) | ||
3043 | kfree(pset); | ||
3044 | zone_pcp(zone, cpu) = NULL; | ||
3045 | } | ||
3046 | } | ||
3047 | |||
3048 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
3049 | unsigned long action, | ||
3050 | void *hcpu) | ||
3051 | { | ||
3052 | int cpu = (long)hcpu; | ||
3053 | int ret = NOTIFY_OK; | ||
3054 | |||
3055 | switch (action) { | ||
3056 | case CPU_UP_PREPARE: | ||
3057 | case CPU_UP_PREPARE_FROZEN: | ||
3058 | if (process_zones(cpu)) | ||
3059 | ret = NOTIFY_BAD; | ||
3060 | break; | ||
3061 | case CPU_UP_CANCELED: | ||
3062 | case CPU_UP_CANCELED_FROZEN: | ||
3063 | case CPU_DEAD: | ||
3064 | case CPU_DEAD_FROZEN: | ||
3065 | free_zone_pagesets(cpu); | ||
3066 | break; | ||
3067 | default: | ||
3068 | break; | ||
3069 | } | ||
3070 | return ret; | ||
3071 | } | ||
3072 | |||
3073 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
3074 | { &pageset_cpuup_callback, NULL, 0 }; | ||
3075 | |||
3076 | void __init setup_per_cpu_pageset(void) | 3413 | void __init setup_per_cpu_pageset(void) |
3077 | { | 3414 | { |
3078 | int err; | 3415 | struct zone *zone; |
3079 | 3416 | ||
3080 | /* Initialize per_cpu_pageset for cpu 0. | 3417 | for_each_populated_zone(zone) |
3081 | * A cpuup callback will do this for every cpu | 3418 | setup_zone_pageset(zone); |
3082 | * as it comes online | ||
3083 | */ | ||
3084 | err = process_zones(smp_processor_id()); | ||
3085 | BUG_ON(err); | ||
3086 | register_cpu_notifier(&pageset_notifier); | ||
3087 | } | 3419 | } |
3088 | 3420 | ||
3089 | #endif | ||
3090 | |||
3091 | static noinline __init_refok | 3421 | static noinline __init_refok |
3092 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3422 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3093 | { | 3423 | { |
@@ -3131,23 +3461,45 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3131 | return 0; | 3461 | return 0; |
3132 | } | 3462 | } |
3133 | 3463 | ||
3134 | static __meminit void zone_pcp_init(struct zone *zone) | 3464 | static int __zone_pcp_update(void *data) |
3135 | { | 3465 | { |
3466 | struct zone *zone = data; | ||
3136 | int cpu; | 3467 | int cpu; |
3137 | unsigned long batch = zone_batchsize(zone); | 3468 | unsigned long batch = zone_batchsize(zone), flags; |
3138 | 3469 | ||
3139 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 3470 | for_each_possible_cpu(cpu) { |
3140 | #ifdef CONFIG_NUMA | 3471 | struct per_cpu_pageset *pset; |
3141 | /* Early boot. Slab allocator not functional yet */ | 3472 | struct per_cpu_pages *pcp; |
3142 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | 3473 | |
3143 | setup_pageset(&boot_pageset[cpu],0); | 3474 | pset = per_cpu_ptr(zone->pageset, cpu); |
3144 | #else | 3475 | pcp = &pset->pcp; |
3145 | setup_pageset(zone_pcp(zone,cpu), batch); | 3476 | |
3146 | #endif | 3477 | local_irq_save(flags); |
3478 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3479 | setup_pageset(pset, batch); | ||
3480 | local_irq_restore(flags); | ||
3147 | } | 3481 | } |
3482 | return 0; | ||
3483 | } | ||
3484 | |||
3485 | void zone_pcp_update(struct zone *zone) | ||
3486 | { | ||
3487 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3488 | } | ||
3489 | |||
3490 | static __meminit void zone_pcp_init(struct zone *zone) | ||
3491 | { | ||
3492 | /* | ||
3493 | * per cpu subsystem is not up at this point. The following code | ||
3494 | * relies on the ability of the linker to provide the | ||
3495 | * offset of a (static) per cpu variable into the per cpu area. | ||
3496 | */ | ||
3497 | zone->pageset = &boot_pageset; | ||
3498 | |||
3148 | if (zone->present_pages) | 3499 | if (zone->present_pages) |
3149 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3500 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3150 | zone->name, zone->present_pages, batch); | 3501 | zone->name, zone->present_pages, |
3502 | zone_batchsize(zone)); | ||
3151 | } | 3503 | } |
3152 | 3504 | ||
3153 | __meminit int init_currently_empty_zone(struct zone *zone, | 3505 | __meminit int init_currently_empty_zone(struct zone *zone, |
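zone_pcp_update() runs __zone_pcp_update() under stop_machine(): for every possible CPU it drains whatever is cached on the per-cpu list back to the buddy allocator and reinitialises the pageset with the freshly computed batch. A simplified userspace sketch of that drain-and-reinit loop; free_to_buddy() is an invented stand-in for free_pcppages_bulk(), and there is no stop_machine() or IRQ handling here:

/* Sketch of the per-CPU drain-and-reinit step, not kernel code. */
#include <stdio.h>

#define NR_CPUS 4

struct pcp { int count, high, batch; };

static struct pcp pcps[NR_CPUS];
static long buddy_free_pages;

static void free_to_buddy(struct pcp *p)        /* stand-in for free_pcppages_bulk() */
{
        buddy_free_pages += p->count;
        p->count = 0;
}

static void setup_pcp(struct pcp *p, int batch)
{
        p->count = 0;
        p->batch = batch > 0 ? batch : 1;
        p->high = 6 * p->batch;
}

int main(void)
{
        int cpu, new_batch = 31;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                pcps[cpu].count = 10;            /* pretend pages are cached */

        /* zone_pcp_update() analogue: drain, then re-setup each CPU. */
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                free_to_buddy(&pcps[cpu]);
                setup_pcp(&pcps[cpu], new_batch);
        }
        printf("returned %ld pages, new high=%d\n", buddy_free_pages, pcps[0].high);
        return 0;
}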
@@ -3286,6 +3638,84 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3286 | } | 3638 | } |
3287 | } | 3639 | } |
3288 | 3640 | ||
3641 | #ifdef CONFIG_HAVE_MEMBLOCK | ||
3642 | u64 __init find_memory_core_early(int nid, u64 size, u64 align, | ||
3643 | u64 goal, u64 limit) | ||
3644 | { | ||
3645 | int i; | ||
3646 | |||
3647 | /* Need to go over early_node_map to find out good range for node */ | ||
3648 | for_each_active_range_index_in_nid(i, nid) { | ||
3649 | u64 addr; | ||
3650 | u64 ei_start, ei_last; | ||
3651 | u64 final_start, final_end; | ||
3652 | |||
3653 | ei_last = early_node_map[i].end_pfn; | ||
3654 | ei_last <<= PAGE_SHIFT; | ||
3655 | ei_start = early_node_map[i].start_pfn; | ||
3656 | ei_start <<= PAGE_SHIFT; | ||
3657 | |||
3658 | final_start = max(ei_start, goal); | ||
3659 | final_end = min(ei_last, limit); | ||
3660 | |||
3661 | if (final_start >= final_end) | ||
3662 | continue; | ||
3663 | |||
3664 | addr = memblock_find_in_range(final_start, final_end, size, align); | ||
3665 | |||
3666 | if (addr == MEMBLOCK_ERROR) | ||
3667 | continue; | ||
3668 | |||
3669 | return addr; | ||
3670 | } | ||
3671 | |||
3672 | return MEMBLOCK_ERROR; | ||
3673 | } | ||
3674 | #endif | ||
3675 | |||
3676 | int __init add_from_early_node_map(struct range *range, int az, | ||
3677 | int nr_range, int nid) | ||
3678 | { | ||
3679 | int i; | ||
3680 | u64 start, end; | ||
3681 | |||
3682 | /* need to go over early_node_map to find out good range for node */ | ||
3683 | for_each_active_range_index_in_nid(i, nid) { | ||
3684 | start = early_node_map[i].start_pfn; | ||
3685 | end = early_node_map[i].end_pfn; | ||
3686 | nr_range = add_range(range, az, nr_range, start, end); | ||
3687 | } | ||
3688 | return nr_range; | ||
3689 | } | ||
3690 | |||
3691 | #ifdef CONFIG_NO_BOOTMEM | ||
3692 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
3693 | u64 goal, u64 limit) | ||
3694 | { | ||
3695 | void *ptr; | ||
3696 | u64 addr; | ||
3697 | |||
3698 | if (limit > memblock.current_limit) | ||
3699 | limit = memblock.current_limit; | ||
3700 | |||
3701 | addr = find_memory_core_early(nid, size, align, goal, limit); | ||
3702 | |||
3703 | if (addr == MEMBLOCK_ERROR) | ||
3704 | return NULL; | ||
3705 | |||
3706 | ptr = phys_to_virt(addr); | ||
3707 | memset(ptr, 0, size); | ||
3708 | memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); | ||
3709 | /* | ||
3710 | * The min_count is set to 0 so that bootmem allocated blocks | ||
3711 | * are never reported as leaks. | ||
3712 | */ | ||
3713 | kmemleak_alloc(ptr, size, 0, 0); | ||
3714 | return ptr; | ||
3715 | } | ||
3716 | #endif | ||
3717 | |||
3718 | |||
3289 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3719 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3290 | { | 3720 | { |
3291 | int i; | 3721 | int i; |
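find_memory_core_early() walks the early_node_map entries for a node, clips each range against [goal, limit], and asks memblock for a block inside the clipped range. A hedged userspace sketch of that clipping loop, with invented addresses and find_in_range() standing in for memblock_find_in_range():

/* Sketch of the per-range clipping; addresses and helpers are made up. */
#include <stdint.h>
#include <stdio.h>

#define RANGE_ERROR ((uint64_t)0)

struct range { uint64_t start, end; };

static uint64_t find_in_range(uint64_t start, uint64_t end, uint64_t size, uint64_t align)
{
        uint64_t addr = (start + align - 1) & ~(align - 1);   /* align up */
        return (addr + size <= end) ? addr : RANGE_ERROR;
}

int main(void)
{
        struct range node_ranges[] = { { 0x1000, 0x8000 }, { 0x100000, 0x200000 } };
        uint64_t goal = 0x4000, limit = 0x180000, size = 0x2000, align = 0x1000;
        size_t i;

        for (i = 0; i < sizeof(node_ranges) / sizeof(node_ranges[0]); i++) {
                uint64_t start = node_ranges[i].start > goal ? node_ranges[i].start : goal;
                uint64_t end = node_ranges[i].end < limit ? node_ranges[i].end : limit;
                uint64_t addr;

                if (start >= end)
                        continue;                /* no overlap with [goal, limit) */
                addr = find_in_range(start, end, size, align);
                if (addr != RANGE_ERROR) {
                        printf("found 0x%llx\n", (unsigned long long)addr);
                        return 0;
                }
        }
        printf("no suitable range\n");
        return 0;
}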
@@ -3435,7 +3865,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
3435 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3865 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
3436 | * then all holes in the requested range will be accounted for. | 3866 | * then all holes in the requested range will be accounted for. |
3437 | */ | 3867 | */ |
3438 | static unsigned long __meminit __absent_pages_in_range(int nid, | 3868 | unsigned long __meminit __absent_pages_in_range(int nid, |
3439 | unsigned long range_start_pfn, | 3869 | unsigned long range_start_pfn, |
3440 | unsigned long range_end_pfn) | 3870 | unsigned long range_end_pfn) |
3441 | { | 3871 | { |
@@ -3700,12 +4130,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
3700 | zone_seqlock_init(zone); | 4130 | zone_seqlock_init(zone); |
3701 | zone->zone_pgdat = pgdat; | 4131 | zone->zone_pgdat = pgdat; |
3702 | 4132 | ||
3703 | zone->prev_priority = DEF_PRIORITY; | ||
3704 | |||
3705 | zone_pcp_init(zone); | 4133 | zone_pcp_init(zone); |
3706 | for_each_lru(l) { | 4134 | for_each_lru(l) { |
3707 | INIT_LIST_HEAD(&zone->lru[l].list); | 4135 | INIT_LIST_HEAD(&zone->lru[l].list); |
3708 | zone->lru[l].nr_saved_scan = 0; | 4136 | zone->reclaim_stat.nr_saved_scan[l] = 0; |
3709 | } | 4137 | } |
3710 | zone->reclaim_stat.recent_rotated[0] = 0; | 4138 | zone->reclaim_stat.recent_rotated[0] = 0; |
3711 | zone->reclaim_stat.recent_rotated[1] = 0; | 4139 | zone->reclaim_stat.recent_rotated[1] = 0; |
@@ -3850,7 +4278,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
3850 | } | 4278 | } |
3851 | 4279 | ||
3852 | /* Merge backward if suitable */ | 4280 | /* Merge backward if suitable */ |
3853 | if (start_pfn < early_node_map[i].end_pfn && | 4281 | if (start_pfn < early_node_map[i].start_pfn && |
3854 | end_pfn >= early_node_map[i].start_pfn) { | 4282 | end_pfn >= early_node_map[i].start_pfn) { |
3855 | early_node_map[i].start_pfn = start_pfn; | 4283 | early_node_map[i].start_pfn = start_pfn; |
3856 | return; | 4284 | return; |
@@ -3964,7 +4392,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) | |||
3964 | } | 4392 | } |
3965 | 4393 | ||
3966 | /* sort the node_map by start_pfn */ | 4394 | /* sort the node_map by start_pfn */ |
3967 | static void __init sort_node_map(void) | 4395 | void __init sort_node_map(void) |
3968 | { | 4396 | { |
3969 | sort(early_node_map, (size_t)nr_nodemap_entries, | 4397 | sort(early_node_map, (size_t)nr_nodemap_entries, |
3970 | sizeof(struct node_active_region), | 4398 | sizeof(struct node_active_region), |
@@ -4032,6 +4460,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
4032 | int i, nid; | 4460 | int i, nid; |
4033 | unsigned long usable_startpfn; | 4461 | unsigned long usable_startpfn; |
4034 | unsigned long kernelcore_node, kernelcore_remaining; | 4462 | unsigned long kernelcore_node, kernelcore_remaining; |
4463 | /* save the state before borrow the nodemask */ | ||
4464 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | ||
4035 | unsigned long totalpages = early_calculate_totalpages(); | 4465 | unsigned long totalpages = early_calculate_totalpages(); |
4036 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4466 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
4037 | 4467 | ||
@@ -4059,7 +4489,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
4059 | 4489 | ||
4060 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4490 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ |
4061 | if (!required_kernelcore) | 4491 | if (!required_kernelcore) |
4062 | return; | 4492 | goto out; |
4063 | 4493 | ||
4064 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4494 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
4065 | find_usable_zone_for_movable(); | 4495 | find_usable_zone_for_movable(); |
@@ -4158,6 +4588,10 @@ restart: | |||
4158 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 4588 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
4159 | zone_movable_pfn[nid] = | 4589 | zone_movable_pfn[nid] = |
4160 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 4590 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
4591 | |||
4592 | out: | ||
4593 | /* restore the node_state */ | ||
4594 | node_states[N_HIGH_MEMORY] = saved_node_state; | ||
4161 | } | 4595 | } |
4162 | 4596 | ||
4163 | /* Any regular memory on that node ? */ | 4597 | /* Any regular memory on that node ? */ |
@@ -4222,8 +4656,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4222 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4656 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4223 | if (i == ZONE_MOVABLE) | 4657 | if (i == ZONE_MOVABLE) |
4224 | continue; | 4658 | continue; |
4225 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4659 | printk(" %-8s ", zone_names[i]); |
4226 | zone_names[i], | 4660 | if (arch_zone_lowest_possible_pfn[i] == |
4661 | arch_zone_highest_possible_pfn[i]) | ||
4662 | printk("empty\n"); | ||
4663 | else | ||
4664 | printk("%0#10lx -> %0#10lx\n", | ||
4227 | arch_zone_lowest_possible_pfn[i], | 4665 | arch_zone_lowest_possible_pfn[i], |
4228 | arch_zone_highest_possible_pfn[i]); | 4666 | arch_zone_highest_possible_pfn[i]); |
4229 | } | 4667 | } |
@@ -4242,11 +4680,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4242 | early_node_map[i].start_pfn, | 4680 | early_node_map[i].start_pfn, |
4243 | early_node_map[i].end_pfn); | 4681 | early_node_map[i].end_pfn); |
4244 | 4682 | ||
4245 | /* | ||
4246 | * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init | ||
4247 | * that node_mask, clear it at first | ||
4248 | */ | ||
4249 | nodes_clear(node_states[N_HIGH_MEMORY]); | ||
4250 | /* Initialise every node */ | 4683 | /* Initialise every node */ |
4251 | mminit_verify_pageflags_layout(); | 4684 | mminit_verify_pageflags_layout(); |
4252 | setup_nr_node_ids(); | 4685 | setup_nr_node_ids(); |
@@ -4317,7 +4750,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4317 | } | 4750 | } |
4318 | 4751 | ||
4319 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4752 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4320 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4753 | struct pglist_data __refdata contig_page_data = { |
4754 | #ifndef CONFIG_NO_BOOTMEM | ||
4755 | .bdata = &bootmem_node_data[0] | ||
4756 | #endif | ||
4757 | }; | ||
4321 | EXPORT_SYMBOL(contig_page_data); | 4758 | EXPORT_SYMBOL(contig_page_data); |
4322 | #endif | 4759 | #endif |
4323 | 4760 | ||
@@ -4493,7 +4930,7 @@ void setup_per_zone_wmarks(void) | |||
4493 | calculate_totalreserve_pages(); | 4930 | calculate_totalreserve_pages(); |
4494 | } | 4931 | } |
4495 | 4932 | ||
4496 | /** | 4933 | /* |
4497 | * The inactive anon list should be small enough that the VM never has to | 4934 | * The inactive anon list should be small enough that the VM never has to |
4498 | * do too much work, but large enough that each inactive page has a chance | 4935 | * do too much work, but large enough that each inactive page has a chance |
4499 | * to be referenced again before it is swapped out. | 4936 | * to be referenced again before it is swapped out. |
@@ -4584,9 +5021,9 @@ module_init(init_per_zone_wmark_min) | |||
4584 | * changes. | 5021 | * changes. |
4585 | */ | 5022 | */ |
4586 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5023 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
4587 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 5024 | void __user *buffer, size_t *length, loff_t *ppos) |
4588 | { | 5025 | { |
4589 | proc_dointvec(table, write, file, buffer, length, ppos); | 5026 | proc_dointvec(table, write, buffer, length, ppos); |
4590 | if (write) | 5027 | if (write) |
4591 | setup_per_zone_wmarks(); | 5028 | setup_per_zone_wmarks(); |
4592 | return 0; | 5029 | return 0; |
@@ -4594,12 +5031,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
4594 | 5031 | ||
4595 | #ifdef CONFIG_NUMA | 5032 | #ifdef CONFIG_NUMA |
4596 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 5033 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
4597 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 5034 | void __user *buffer, size_t *length, loff_t *ppos) |
4598 | { | 5035 | { |
4599 | struct zone *zone; | 5036 | struct zone *zone; |
4600 | int rc; | 5037 | int rc; |
4601 | 5038 | ||
4602 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 5039 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4603 | if (rc) | 5040 | if (rc) |
4604 | return rc; | 5041 | return rc; |
4605 | 5042 | ||
@@ -4610,12 +5047,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
4610 | } | 5047 | } |
4611 | 5048 | ||
4612 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 5049 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
4613 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 5050 | void __user *buffer, size_t *length, loff_t *ppos) |
4614 | { | 5051 | { |
4615 | struct zone *zone; | 5052 | struct zone *zone; |
4616 | int rc; | 5053 | int rc; |
4617 | 5054 | ||
4618 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 5055 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4619 | if (rc) | 5056 | if (rc) |
4620 | return rc; | 5057 | return rc; |
4621 | 5058 | ||
@@ -4636,9 +5073,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
4636 | * if in function of the boot time zone sizes. | 5073 | * if in function of the boot time zone sizes. |
4637 | */ | 5074 | */ |
4638 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 5075 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
4639 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 5076 | void __user *buffer, size_t *length, loff_t *ppos) |
4640 | { | 5077 | { |
4641 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 5078 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
4642 | setup_per_zone_lowmem_reserve(); | 5079 | setup_per_zone_lowmem_reserve(); |
4643 | return 0; | 5080 | return 0; |
4644 | } | 5081 | } |
@@ -4650,20 +5087,21 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
4650 | */ | 5087 | */ |
4651 | 5088 | ||
4652 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5089 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
4653 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 5090 | void __user *buffer, size_t *length, loff_t *ppos) |
4654 | { | 5091 | { |
4655 | struct zone *zone; | 5092 | struct zone *zone; |
4656 | unsigned int cpu; | 5093 | unsigned int cpu; |
4657 | int ret; | 5094 | int ret; |
4658 | 5095 | ||
4659 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 5096 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
4660 | if (!write || (ret == -EINVAL)) | 5097 | if (!write || (ret == -EINVAL)) |
4661 | return ret; | 5098 | return ret; |
4662 | for_each_zone(zone) { | 5099 | for_each_populated_zone(zone) { |
4663 | for_each_online_cpu(cpu) { | 5100 | for_each_possible_cpu(cpu) { |
4664 | unsigned long high; | 5101 | unsigned long high; |
4665 | high = zone->present_pages / percpu_pagelist_fraction; | 5102 | high = zone->present_pages / percpu_pagelist_fraction; |
4666 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 5103 | setup_pagelist_highmark( |
5104 | per_cpu_ptr(zone->pageset, cpu), high); | ||
4667 | } | 5105 | } |
4668 | } | 5106 | } |
4669 | return 0; | 5107 | return 0; |
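The sysctl handler above now iterates populated zones and possible CPUs, setting each pcp high mark to present_pages / percpu_pagelist_fraction. A small sketch of the resulting numbers, reusing the batch rule from setup_pagelist_highmark() shown earlier, with an illustrative zone size:

/* Sketch of the high-mark arithmetic; values are illustrative. */
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long present_pages = 262144;    /* 1 GiB of 4 KiB pages, made up */
        int fraction = 8;
        unsigned long high = present_pages / fraction;
        unsigned long batch = high / 4 ? high / 4 : 1;

        if (batch > PAGE_SHIFT * 8)
                batch = PAGE_SHIFT * 8;          /* cap, as in setup_pagelist_highmark() */
        printf("pcp high=%lu batch=%lu\n", high, batch);
        return 0;
}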
@@ -4716,7 +5154,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4716 | numentries <<= (PAGE_SHIFT - scale); | 5154 | numentries <<= (PAGE_SHIFT - scale); |
4717 | 5155 | ||
4718 | /* Make sure we've got at least a 0-order allocation.. */ | 5156 | /* Make sure we've got at least a 0-order allocation.. */ |
4719 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 5157 | if (unlikely(flags & HASH_SMALL)) { |
5158 | /* Makes no sense without HASH_EARLY */ | ||
5159 | WARN_ON(!(flags & HASH_EARLY)); | ||
5160 | if (!(numentries >> *_hash_shift)) { | ||
5161 | numentries = 1UL << *_hash_shift; | ||
5162 | BUG_ON(!numentries); | ||
5163 | } | ||
5164 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | ||
4720 | numentries = PAGE_SIZE / bucketsize; | 5165 | numentries = PAGE_SIZE / bucketsize; |
4721 | } | 5166 | } |
4722 | numentries = roundup_pow_of_two(numentries); | 5167 | numentries = roundup_pow_of_two(numentries); |
@@ -4744,17 +5189,19 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4744 | * some pages at the end of hash table which | 5189 | * some pages at the end of hash table which |
4745 | * alloc_pages_exact() automatically does | 5190 | * alloc_pages_exact() automatically does |
4746 | */ | 5191 | */ |
4747 | if (get_order(size) < MAX_ORDER) | 5192 | if (get_order(size) < MAX_ORDER) { |
4748 | table = alloc_pages_exact(size, GFP_ATOMIC); | 5193 | table = alloc_pages_exact(size, GFP_ATOMIC); |
5194 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
5195 | } | ||
4749 | } | 5196 | } |
4750 | } while (!table && size > PAGE_SIZE && --log2qty); | 5197 | } while (!table && size > PAGE_SIZE && --log2qty); |
4751 | 5198 | ||
4752 | if (!table) | 5199 | if (!table) |
4753 | panic("Failed to allocate %s hash table\n", tablename); | 5200 | panic("Failed to allocate %s hash table\n", tablename); |
4754 | 5201 | ||
4755 | printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", | 5202 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", |
4756 | tablename, | 5203 | tablename, |
4757 | (1U << log2qty), | 5204 | (1UL << log2qty), |
4758 | ilog2(size) - PAGE_SHIFT, | 5205 | ilog2(size) - PAGE_SHIFT, |
4759 | size); | 5206 | size); |
4760 | 5207 | ||
@@ -4763,16 +5210,6 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
4763 | if (_hash_mask) | 5210 | if (_hash_mask) |
4764 | *_hash_mask = (1 << log2qty) - 1; | 5211 | *_hash_mask = (1 << log2qty) - 1; |
4765 | 5212 | ||
4766 | /* | ||
4767 | * If hashdist is set, the table allocation is done with __vmalloc() | ||
4768 | * which invokes the kmemleak_alloc() callback. This function may also | ||
4769 | * be called before the slab and kmemleak are initialised when | ||
4770 | * kmemleak simply buffers the request to be executed later | ||
4771 | * (GFP_ATOMIC flag ignored in this case). | ||
4772 | */ | ||
4773 | if (!hashdist) | ||
4774 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | ||
4775 | |||
4776 | return table; | 5213 | return table; |
4777 | } | 5214 | } |
4778 | 5215 | ||
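alloc_large_system_hash() sizes the table by rounding the entry estimate to a power of two and handing back the matching shift and mask; the hunks above add the HASH_SMALL lower bound, move the kmemleak registration to the alloc_pages_exact() path, and switch to %ld for large tables. A sketch of just the sizing arithmetic, with an illustrative entry count:

/* Sketch of the sizing math only, not the allocation fallbacks. */
#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
        unsigned long p = 1;
        while (p < n)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned long numentries = 3000;         /* illustrative estimate */
        unsigned long log2qty, mask;

        numentries = roundup_pow_of_two(numentries);    /* -> 4096 */
        for (log2qty = 0; (1UL << log2qty) < numentries; log2qty++)
                ;
        mask = (1UL << log2qty) - 1;
        printf("entries=%lu (shift=%lu), hash mask=%#lx\n", numentries, log2qty, mask);
        return 0;
}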
@@ -4861,23 +5298,113 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
4861 | * page allocator never allocates memory from an ISOLATE block. | 5298 | * page allocator never allocates memory from an ISOLATE block. 
4862 | */ | 5299 | */ |
4863 | 5300 | ||
5301 | static int | ||
5302 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5303 | { | ||
5304 | unsigned long pfn, iter, found; | ||
5305 | /* | ||
5306 | * To avoid noisy data, lru_add_drain_all() should be called first. | ||
5307 | * If the zone is ZONE_MOVABLE, it never contains immobile pages. | ||
5308 | */ | ||
5309 | if (zone_idx(zone) == ZONE_MOVABLE) | ||
5310 | return true; | ||
5311 | |||
5312 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) | ||
5313 | return true; | ||
5314 | |||
5315 | pfn = page_to_pfn(page); | ||
5316 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | ||
5317 | unsigned long check = pfn + iter; | ||
5318 | |||
5319 | if (!pfn_valid_within(check)) { | ||
5320 | iter++; | ||
5321 | continue; | ||
5322 | } | ||
5323 | page = pfn_to_page(check); | ||
5324 | if (!page_count(page)) { | ||
5325 | if (PageBuddy(page)) | ||
5326 | iter += (1 << page_order(page)) - 1; | ||
5327 | continue; | ||
5328 | } | ||
5329 | if (!PageLRU(page)) | ||
5330 | found++; | ||
5331 | /* | ||
5332 | * If there are RECLAIMABLE pages, we need to check it. | ||
5333 | * But now, memory offline itself doesn't call shrink_slab() | ||
5334 | * and it still to be fixed. | ||
5335 | */ | ||
5336 | /* | ||
5337 | * If the page is not RAM, page_count() should be 0. | ||
5338 | * We don't need to check further; this is a _used_, not-movable page. | ||
5339 | * | ||
5340 | * The problematic thing here is PG_reserved pages. PG_reserved | ||
5341 | * is set to both of a memory hole page and a _used_ kernel | ||
5342 | * page at boot. | ||
5343 | */ | ||
5344 | if (found > count) | ||
5345 | return false; | ||
5346 | } | ||
5347 | return true; | ||
5348 | } | ||
5349 | |||
5350 | bool is_pageblock_removable_nolock(struct page *page) | ||
5351 | { | ||
5352 | struct zone *zone = page_zone(page); | ||
5353 | return __count_immobile_pages(zone, page, 0); | ||
5354 | } | ||
5355 | |||
4864 | int set_migratetype_isolate(struct page *page) | 5356 | int set_migratetype_isolate(struct page *page) |
4865 | { | 5357 | { |
4866 | struct zone *zone; | 5358 | struct zone *zone; |
4867 | unsigned long flags; | 5359 | unsigned long flags, pfn; |
5360 | struct memory_isolate_notify arg; | ||
5361 | int notifier_ret; | ||
4868 | int ret = -EBUSY; | 5362 | int ret = -EBUSY; |
5363 | int zone_idx; | ||
4869 | 5364 | ||
4870 | zone = page_zone(page); | 5365 | zone = page_zone(page); |
5366 | zone_idx = zone_idx(zone); | ||
5367 | |||
4871 | spin_lock_irqsave(&zone->lock, flags); | 5368 | spin_lock_irqsave(&zone->lock, flags); |
5369 | |||
5370 | pfn = page_to_pfn(page); | ||
5371 | arg.start_pfn = pfn; | ||
5372 | arg.nr_pages = pageblock_nr_pages; | ||
5373 | arg.pages_found = 0; | ||
5374 | |||
4872 | /* | 5375 | /* |
4873 | * In future, more migrate types will be able to be isolation target. | 5376 | * It may be possible to isolate a pageblock even if the |
5377 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5378 | * notifier chain is used by balloon drivers to return the | ||
5379 | * number of pages in a range that are held by the balloon | ||
5380 | * driver to shrink memory. If all the pages are accounted for | ||
5381 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5382 | * Later, for example, when memory hotplug notifier runs, these | ||
5383 | * pages reported as "can be isolated" should be isolated(freed) | ||
5384 | * by the balloon driver through the memory notifier chain. | ||
4874 | */ | 5385 | */ |
4875 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | 5386 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5387 | notifier_ret = notifier_to_errno(notifier_ret); | ||
5388 | if (notifier_ret) | ||
4876 | goto out; | 5389 | goto out; |
4877 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5390 | /* |
4878 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5391 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
4879 | ret = 0; | 5392 | * We just check MOVABLE pages. |
5393 | */ | ||
5394 | if (__count_immobile_pages(zone, page, arg.pages_found)) | ||
5395 | ret = 0; | ||
5396 | |||
5397 | /* | ||
5398 | * immobile means "not-on-lru" pages. If immobile is larger than | ||
5399 | * removable-by-driver pages reported by notifier, we'll fail. | ||
5400 | */ | ||
5401 | |||
4880 | out: | 5402 | out: |
5403 | if (!ret) { | ||
5404 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5405 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5406 | } | ||
5407 | |||
4881 | spin_unlock_irqrestore(&zone->lock, flags); | 5408 | spin_unlock_irqrestore(&zone->lock, flags); |
4882 | if (!ret) | 5409 | if (!ret) |
4883 | drain_all_pages(); | 5410 | drain_all_pages(); |
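__count_immobile_pages() walks one pageblock, skips free pages, counts used pages that are not on the LRU, and fails once that count exceeds what the isolation notifier said balloon drivers can hand back. A userspace sketch of the same counting idea over an invented page model:

/* Sketch only; the page model here is made up for illustration. */
#include <stdbool.h>
#include <stdio.h>

struct fake_page { int count; bool on_lru; };

static bool block_is_removable(struct fake_page *block, int nr, int allowed)
{
        int found = 0, i;

        for (i = 0; i < nr; i++) {
                if (block[i].count == 0)
                        continue;               /* free page: movable by definition */
                if (!block[i].on_lru)
                        found++;                /* used and not on the LRU: immobile */
                if (found > allowed)
                        return false;
        }
        return true;
}

int main(void)
{
        struct fake_page block[8] = {
                {0, false}, {1, true}, {1, true}, {0, false},
                {1, false}, {1, true}, {0, false}, {1, true},
        };

        printf("removable with 0 allowed: %d\n", block_is_removable(block, 8, 0));
        printf("removable with 1 allowed: %d\n", block_is_removable(block, 8, 1));
        return 0;
}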
@@ -4944,3 +5471,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
4944 | spin_unlock_irqrestore(&zone->lock, flags); | 5471 | spin_unlock_irqrestore(&zone->lock, flags); |
4945 | } | 5472 | } |
4946 | #endif | 5473 | #endif |
5474 | |||
5475 | #ifdef CONFIG_MEMORY_FAILURE | ||
5476 | bool is_free_buddy_page(struct page *page) | ||
5477 | { | ||
5478 | struct zone *zone = page_zone(page); | ||
5479 | unsigned long pfn = page_to_pfn(page); | ||
5480 | unsigned long flags; | ||
5481 | int order; | ||
5482 | |||
5483 | spin_lock_irqsave(&zone->lock, flags); | ||
5484 | for (order = 0; order < MAX_ORDER; order++) { | ||
5485 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
5486 | |||
5487 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
5488 | break; | ||
5489 | } | ||
5490 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5491 | |||
5492 | return order < MAX_ORDER; | ||
5493 | } | ||
5494 | #endif | ||
5495 | |||
5496 | static struct trace_print_flags pageflag_names[] = { | ||
5497 | {1UL << PG_locked, "locked" }, | ||
5498 | {1UL << PG_error, "error" }, | ||
5499 | {1UL << PG_referenced, "referenced" }, | ||
5500 | {1UL << PG_uptodate, "uptodate" }, | ||
5501 | {1UL << PG_dirty, "dirty" }, | ||
5502 | {1UL << PG_lru, "lru" }, | ||
5503 | {1UL << PG_active, "active" }, | ||
5504 | {1UL << PG_slab, "slab" }, | ||
5505 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
5506 | {1UL << PG_arch_1, "arch_1" }, | ||
5507 | {1UL << PG_reserved, "reserved" }, | ||
5508 | {1UL << PG_private, "private" }, | ||
5509 | {1UL << PG_private_2, "private_2" }, | ||
5510 | {1UL << PG_writeback, "writeback" }, | ||
5511 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
5512 | {1UL << PG_head, "head" }, | ||
5513 | {1UL << PG_tail, "tail" }, | ||
5514 | #else | ||
5515 | {1UL << PG_compound, "compound" }, | ||
5516 | #endif | ||
5517 | {1UL << PG_swapcache, "swapcache" }, | ||
5518 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
5519 | {1UL << PG_reclaim, "reclaim" }, | ||
5520 | {1UL << PG_buddy, "buddy" }, | ||
5521 | {1UL << PG_swapbacked, "swapbacked" }, | ||
5522 | {1UL << PG_unevictable, "unevictable" }, | ||
5523 | #ifdef CONFIG_MMU | ||
5524 | {1UL << PG_mlocked, "mlocked" }, | ||
5525 | #endif | ||
5526 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
5527 | {1UL << PG_uncached, "uncached" }, | ||
5528 | #endif | ||
5529 | #ifdef CONFIG_MEMORY_FAILURE | ||
5530 | {1UL << PG_hwpoison, "hwpoison" }, | ||
5531 | #endif | ||
5532 | {-1UL, NULL }, | ||
5533 | }; | ||
5534 | |||
5535 | static void dump_page_flags(unsigned long flags) | ||
5536 | { | ||
5537 | const char *delim = ""; | ||
5538 | unsigned long mask; | ||
5539 | int i; | ||
5540 | |||
5541 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
5542 | |||
5543 | /* remove zone id */ | ||
5544 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
5545 | |||
5546 | for (i = 0; pageflag_names[i].name && flags; i++) { | ||
5547 | |||
5548 | mask = pageflag_names[i].mask; | ||
5549 | if ((flags & mask) != mask) | ||
5550 | continue; | ||
5551 | |||
5552 | flags &= ~mask; | ||
5553 | printk("%s%s", delim, pageflag_names[i].name); | ||
5554 | delim = "|"; | ||
5555 | } | ||
5556 | |||
5557 | /* check for left over flags */ | ||
5558 | if (flags) | ||
5559 | printk("%s%#lx", delim, flags); | ||
5560 | |||
5561 | printk(")\n"); | ||
5562 | } | ||
5563 | |||
5564 | void dump_page(struct page *page) | ||
5565 | { | ||
5566 | printk(KERN_ALERT | ||
5567 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
5568 | page, page_count(page), page_mapcount(page), | ||
5569 | page->mapping, page->index); | ||
5570 | dump_page_flags(page->flags); | ||
5571 | } | ||
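dump_page_flags() decodes page->flags by walking a mask/name table, printing each fully-set flag and finally any leftover bits. The same loop works for any bitmask; a standalone sketch with invented flag values:

/* Standalone sketch of the flag-decoding loop; flag bits are invented. */
#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

static const struct flag_name names[] = {
        { 1UL << 0, "locked"   },
        { 1UL << 2, "uptodate" },
        { 1UL << 3, "dirty"    },
        { 1UL << 5, "lru"      },
        { 0, NULL },
};

static void dump_flags(unsigned long flags)
{
        const char *delim = "";
        int i;

        printf("flags: %#lx(", flags);
        for (i = 0; names[i].name && flags; i++) {
                unsigned long mask = names[i].mask;

                if ((flags & mask) != mask)
                        continue;
                flags &= ~mask;                 /* consume the bits we just named */
                printf("%s%s", delim, names[i].name);
                delim = "|";
        }
        if (flags)                              /* bits with no known name */
                printf("%s%#lx", delim, flags);
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 3) | (1UL << 9));
        return 0;
}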