Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c | 731
1 files changed, 660 insertions, 71 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c59571cb7..d315e1127dc9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -41,24 +41,37 @@ | |||
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | 43 | #include <linux/fault-inject.h> |
44 | #include <linux/page-isolation.h> | ||
44 | 45 | ||
45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
46 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
47 | #include "internal.h" | 48 | #include "internal.h" |
48 | 49 | ||
49 | /* | 50 | /* |
50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 51 | * Array of node states. |
51 | * initializer cleaner | ||
52 | */ | 52 | */ |
53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 53 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
54 | EXPORT_SYMBOL(node_online_map); | 54 | [N_POSSIBLE] = NODE_MASK_ALL, |
55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 55 | [N_ONLINE] = { { [0] = 1UL } }, |
56 | EXPORT_SYMBOL(node_possible_map); | 56 | #ifndef CONFIG_NUMA |
57 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
58 | #ifdef CONFIG_HIGHMEM | ||
59 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
60 | #endif | ||
61 | [N_CPU] = { { [0] = 1UL } }, | ||
62 | #endif /* NUMA */ | ||
63 | }; | ||
64 | EXPORT_SYMBOL(node_states); | ||
65 | |||
57 | unsigned long totalram_pages __read_mostly; | 66 | unsigned long totalram_pages __read_mostly; |
58 | unsigned long totalreserve_pages __read_mostly; | 67 | unsigned long totalreserve_pages __read_mostly; |
59 | long nr_swap_pages; | 68 | long nr_swap_pages; |
60 | int percpu_pagelist_fraction; | 69 | int percpu_pagelist_fraction; |
61 | 70 | ||
71 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
72 | int pageblock_order __read_mostly; | ||
73 | #endif | ||
74 | |||
62 | static void __free_pages_ok(struct page *page, unsigned int order); | 75 | static void __free_pages_ok(struct page *page, unsigned int order); |
63 | 76 | ||
64 | /* | 77 | /* |
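The hunk above replaces the old node_online_map/node_possible_map globals with a single node_states[] array indexed by node state. A minimal sketch of how this array is consulted elsewhere (the helpers are the standard nodemask.h accessors also used later in this diff; the node id 1 and the printks are purely illustrative):

#include <linux/kernel.h>
#include <linux/nodemask.h>

/* Illustration only: querying and updating node_states[] as defined above */
static void node_states_example(void)
{
	int nid;

	/* Record that node 1 has a CPU, as process_zones() does below */
	node_set_state(1, N_CPU);

	/* Walk every node with memory, as build_zonelists() does below */
	for_each_node_state(nid, N_HIGH_MEMORY)
		printk(KERN_DEBUG "node %d has memory\n", nid);

	/* node_online() is now simply a test against node_states[N_ONLINE] */
	if (node_online(1))
		printk(KERN_DEBUG "node 1 is online\n");
}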
@@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve; | |||
137 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 150 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
138 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 151 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
139 | unsigned long __initdata required_kernelcore; | 152 | unsigned long __initdata required_kernelcore; |
140 | unsigned long __initdata required_movablecore; | 153 | static unsigned long __initdata required_movablecore; |
141 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 154 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
142 | 155 | ||
143 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 156 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
@@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; | |||
150 | EXPORT_SYMBOL(nr_node_ids); | 163 | EXPORT_SYMBOL(nr_node_ids); |
151 | #endif | 164 | #endif |
152 | 165 | ||
166 | int page_group_by_mobility_disabled __read_mostly; | ||
167 | |||
168 | static void set_pageblock_migratetype(struct page *page, int migratetype) | ||
169 | { | ||
170 | set_pageblock_flags_group(page, (unsigned long)migratetype, | ||
171 | PB_migrate, PB_migrate_end); | ||
172 | } | ||
173 | |||
153 | #ifdef CONFIG_DEBUG_VM | 174 | #ifdef CONFIG_DEBUG_VM |
154 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 175 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
155 | { | 176 | { |
@@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
293 | clear_highpage(page + i); | 314 | clear_highpage(page + i); |
294 | } | 315 | } |
295 | 316 | ||
296 | /* | ||
297 | * function for dealing with page's order in buddy system. | ||
298 | * zone->lock is already acquired when we use these. | ||
299 | * So, we don't need atomic page->flags operations here. | ||
300 | */ | ||
301 | static inline unsigned long page_order(struct page *page) | ||
302 | { | ||
303 | return page_private(page); | ||
304 | } | ||
305 | |||
306 | static inline void set_page_order(struct page *page, int order) | 317 | static inline void set_page_order(struct page *page, int order) |
307 | { | 318 | { |
308 | set_page_private(page, order); | 319 | set_page_private(page, order); |
@@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page, | |||
404 | { | 415 | { |
405 | unsigned long page_idx; | 416 | unsigned long page_idx; |
406 | int order_size = 1 << order; | 417 | int order_size = 1 << order; |
418 | int migratetype = get_pageblock_migratetype(page); | ||
407 | 419 | ||
408 | if (unlikely(PageCompound(page))) | 420 | if (unlikely(PageCompound(page))) |
409 | destroy_compound_page(page, order); | 421 | destroy_compound_page(page, order); |
@@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page, | |||
416 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
417 | while (order < MAX_ORDER-1) { | 429 | while (order < MAX_ORDER-1) { |
418 | unsigned long combined_idx; | 430 | unsigned long combined_idx; |
419 | struct free_area *area; | ||
420 | struct page *buddy; | 431 | struct page *buddy; |
421 | 432 | ||
422 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
@@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page, | |||
424 | break; /* Move the buddy up one level. */ | 435 | break; /* Move the buddy up one level. */ |
425 | 436 | ||
426 | list_del(&buddy->lru); | 437 | list_del(&buddy->lru); |
427 | area = zone->free_area + order; | 438 | zone->free_area[order].nr_free--; |
428 | area->nr_free--; | ||
429 | rmv_page_order(buddy); | 439 | rmv_page_order(buddy); |
430 | combined_idx = __find_combined_index(page_idx, order); | 440 | combined_idx = __find_combined_index(page_idx, order); |
431 | page = page + (combined_idx - page_idx); | 441 | page = page + (combined_idx - page_idx); |
@@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page, | |||
433 | order++; | 443 | order++; |
434 | } | 444 | } |
435 | set_page_order(page, order); | 445 | set_page_order(page, order); |
436 | list_add(&page->lru, &zone->free_area[order].free_list); | 446 | list_add(&page->lru, |
447 | &zone->free_area[order].free_list[migratetype]); | ||
437 | zone->free_area[order].nr_free++; | 448 | zone->free_area[order].nr_free++; |
438 | } | 449 | } |
439 | 450 | ||
@@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
567 | * -- wli | 578 | * -- wli |
568 | */ | 579 | */ |
569 | static inline void expand(struct zone *zone, struct page *page, | 580 | static inline void expand(struct zone *zone, struct page *page, |
570 | int low, int high, struct free_area *area) | 581 | int low, int high, struct free_area *area, |
582 | int migratetype) | ||
571 | { | 583 | { |
572 | unsigned long size = 1 << high; | 584 | unsigned long size = 1 << high; |
573 | 585 | ||
@@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
576 | high--; | 588 | high--; |
577 | size >>= 1; | 589 | size >>= 1; |
578 | VM_BUG_ON(bad_range(zone, &page[size])); | 590 | VM_BUG_ON(bad_range(zone, &page[size])); |
579 | list_add(&page[size].lru, &area->free_list); | 591 | list_add(&page[size].lru, &area->free_list[migratetype]); |
580 | area->nr_free++; | 592 | area->nr_free++; |
581 | set_page_order(&page[size], high); | 593 | set_page_order(&page[size], high); |
582 | } | 594 | } |
@@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
628 | return 0; | 640 | return 0; |
629 | } | 641 | } |
630 | 642 | ||
631 | /* | 643 | /* |
632 | * Do the hard work of removing an element from the buddy allocator. | 644 | * Go through the free lists for the given migratetype and remove |
633 | * Call me with the zone->lock already held. | 645 | * the smallest available page from the freelists |
634 | */ | 646 | */ |
635 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 647 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
648 | int migratetype) | ||
636 | { | 649 | { |
637 | struct free_area * area; | ||
638 | unsigned int current_order; | 650 | unsigned int current_order; |
651 | struct free_area * area; | ||
639 | struct page *page; | 652 | struct page *page; |
640 | 653 | ||
654 | /* Find a page of the appropriate size in the preferred list */ | ||
641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 655 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
642 | area = zone->free_area + current_order; | 656 | area = &(zone->free_area[current_order]); |
643 | if (list_empty(&area->free_list)) | 657 | if (list_empty(&area->free_list[migratetype])) |
644 | continue; | 658 | continue; |
645 | 659 | ||
646 | page = list_entry(area->free_list.next, struct page, lru); | 660 | page = list_entry(area->free_list[migratetype].next, |
661 | struct page, lru); | ||
647 | list_del(&page->lru); | 662 | list_del(&page->lru); |
648 | rmv_page_order(page); | 663 | rmv_page_order(page); |
649 | area->nr_free--; | 664 | area->nr_free--; |
650 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 665 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
651 | expand(zone, page, order, current_order, area); | 666 | expand(zone, page, order, current_order, area, migratetype); |
652 | return page; | 667 | return page; |
653 | } | 668 | } |
654 | 669 | ||
655 | return NULL; | 670 | return NULL; |
656 | } | 671 | } |
657 | 672 | ||
673 | |||
674 | /* | ||
675 | * This array describes the order lists are fallen back to when | ||
676 | * the free lists for the desirable migrate type are depleted | ||
677 | */ | ||
678 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | ||
679 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
680 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
681 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
682 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | ||
683 | }; | ||
684 | |||
685 | /* | ||
686 | * Move the free pages in a range to the free lists of the requested type. | ||
687 | * Note that start_page and end_pages are not aligned on a pageblock | ||
688 | * boundary. If alignment is required, use move_freepages_block() | ||
689 | */ | ||
690 | int move_freepages(struct zone *zone, | ||
691 | struct page *start_page, struct page *end_page, | ||
692 | int migratetype) | ||
693 | { | ||
694 | struct page *page; | ||
695 | unsigned long order; | ||
696 | int pages_moved = 0; | ||
697 | |||
698 | #ifndef CONFIG_HOLES_IN_ZONE | ||
699 | /* | ||
700 | * page_zone is not safe to call in this context when | ||
701 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | ||
702 | * anyway as we check zone boundaries in move_freepages_block(). | ||
703 | * Remove at a later date when no bug reports exist related to | ||
704 | * grouping pages by mobility | ||
705 | */ | ||
706 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | ||
707 | #endif | ||
708 | |||
709 | for (page = start_page; page <= end_page;) { | ||
710 | if (!pfn_valid_within(page_to_pfn(page))) { | ||
711 | page++; | ||
712 | continue; | ||
713 | } | ||
714 | |||
715 | if (!PageBuddy(page)) { | ||
716 | page++; | ||
717 | continue; | ||
718 | } | ||
719 | |||
720 | order = page_order(page); | ||
721 | list_del(&page->lru); | ||
722 | list_add(&page->lru, | ||
723 | &zone->free_area[order].free_list[migratetype]); | ||
724 | page += 1 << order; | ||
725 | pages_moved += 1 << order; | ||
726 | } | ||
727 | |||
728 | return pages_moved; | ||
729 | } | ||
730 | |||
731 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | ||
732 | { | ||
733 | unsigned long start_pfn, end_pfn; | ||
734 | struct page *start_page, *end_page; | ||
735 | |||
736 | start_pfn = page_to_pfn(page); | ||
737 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | ||
738 | start_page = pfn_to_page(start_pfn); | ||
739 | end_page = start_page + pageblock_nr_pages - 1; | ||
740 | end_pfn = start_pfn + pageblock_nr_pages - 1; | ||
741 | |||
742 | /* Do not cross zone boundaries */ | ||
743 | if (start_pfn < zone->zone_start_pfn) | ||
744 | start_page = page; | ||
745 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
746 | return 0; | ||
747 | |||
748 | return move_freepages(zone, start_page, end_page, migratetype); | ||
749 | } | ||
750 | |||
751 | /* Return the page with the lowest PFN in the list */ | ||
752 | static struct page *min_page(struct list_head *list) | ||
753 | { | ||
754 | unsigned long min_pfn = -1UL; | ||
755 | struct page *min_page = NULL, *page;; | ||
756 | |||
757 | list_for_each_entry(page, list, lru) { | ||
758 | unsigned long pfn = page_to_pfn(page); | ||
759 | if (pfn < min_pfn) { | ||
760 | min_pfn = pfn; | ||
761 | min_page = page; | ||
762 | } | ||
763 | } | ||
764 | |||
765 | return min_page; | ||
766 | } | ||
767 | |||
768 | /* Remove an element from the buddy allocator from the fallback list */ | ||
769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | ||
770 | int start_migratetype) | ||
771 | { | ||
772 | struct free_area * area; | ||
773 | int current_order; | ||
774 | struct page *page; | ||
775 | int migratetype, i; | ||
776 | |||
777 | /* Find the largest possible block of pages in the other list */ | ||
778 | for (current_order = MAX_ORDER-1; current_order >= order; | ||
779 | --current_order) { | ||
780 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | ||
781 | migratetype = fallbacks[start_migratetype][i]; | ||
782 | |||
783 | /* MIGRATE_RESERVE handled later if necessary */ | ||
784 | if (migratetype == MIGRATE_RESERVE) | ||
785 | continue; | ||
786 | |||
787 | area = &(zone->free_area[current_order]); | ||
788 | if (list_empty(&area->free_list[migratetype])) | ||
789 | continue; | ||
790 | |||
791 | /* Bias kernel allocations towards low pfns */ | ||
792 | page = list_entry(area->free_list[migratetype].next, | ||
793 | struct page, lru); | ||
794 | if (unlikely(start_migratetype != MIGRATE_MOVABLE)) | ||
795 | page = min_page(&area->free_list[migratetype]); | ||
796 | area->nr_free--; | ||
797 | |||
798 | /* | ||
799 | * If breaking a large block of pages, move all free | ||
800 | * pages to the preferred allocation list. If falling | ||
801 | * back for a reclaimable kernel allocation, be more | ||
802 | * agressive about taking ownership of free pages | ||
803 | */ | ||
804 | if (unlikely(current_order >= (pageblock_order >> 1)) || | ||
805 | start_migratetype == MIGRATE_RECLAIMABLE) { | ||
806 | unsigned long pages; | ||
807 | pages = move_freepages_block(zone, page, | ||
808 | start_migratetype); | ||
809 | |||
810 | /* Claim the whole block if over half of it is free */ | ||
811 | if (pages >= (1 << (pageblock_order-1))) | ||
812 | set_pageblock_migratetype(page, | ||
813 | start_migratetype); | ||
814 | |||
815 | migratetype = start_migratetype; | ||
816 | } | ||
817 | |||
818 | /* Remove the page from the freelists */ | ||
819 | list_del(&page->lru); | ||
820 | rmv_page_order(page); | ||
821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
822 | -(1UL << order)); | ||
823 | |||
824 | if (current_order == pageblock_order) | ||
825 | set_pageblock_migratetype(page, | ||
826 | start_migratetype); | ||
827 | |||
828 | expand(zone, page, order, current_order, area, migratetype); | ||
829 | return page; | ||
830 | } | ||
831 | } | ||
832 | |||
833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | ||
834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
835 | } | ||
836 | |||
837 | /* | ||
838 | * Do the hard work of removing an element from the buddy allocator. | ||
839 | * Call me with the zone->lock already held. | ||
840 | */ | ||
841 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | ||
842 | int migratetype) | ||
843 | { | ||
844 | struct page *page; | ||
845 | |||
846 | page = __rmqueue_smallest(zone, order, migratetype); | ||
847 | |||
848 | if (unlikely(!page)) | ||
849 | page = __rmqueue_fallback(zone, order, migratetype); | ||
850 | |||
851 | return page; | ||
852 | } | ||
853 | |||
658 | /* | 854 | /* |
659 | * Obtain a specified number of elements from the buddy allocator, all under | 855 | * Obtain a specified number of elements from the buddy allocator, all under |
660 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 856 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
661 | * Returns the number of new pages which were placed at *list. | 857 | * Returns the number of new pages which were placed at *list. |
662 | */ | 858 | */ |
663 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 859 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
664 | unsigned long count, struct list_head *list) | 860 | unsigned long count, struct list_head *list, |
861 | int migratetype) | ||
665 | { | 862 | { |
666 | int i; | 863 | int i; |
667 | 864 | ||
668 | spin_lock(&zone->lock); | 865 | spin_lock(&zone->lock); |
669 | for (i = 0; i < count; ++i) { | 866 | for (i = 0; i < count; ++i) { |
670 | struct page *page = __rmqueue(zone, order); | 867 | struct page *page = __rmqueue(zone, order, migratetype); |
671 | if (unlikely(page == NULL)) | 868 | if (unlikely(page == NULL)) |
672 | break; | 869 | break; |
673 | list_add_tail(&page->lru, list); | 870 | list_add(&page->lru, list); |
871 | set_page_private(page, migratetype); | ||
674 | } | 872 | } |
675 | spin_unlock(&zone->lock); | 873 | spin_unlock(&zone->lock); |
676 | return i; | 874 | return i; |
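For the fallback logic added above, it may help to spell out the search order for a single request. The sketch below only prints the order in which free lists would be examined for an order-0 MIGRATE_UNMOVABLE allocation, mirroring __rmqueue_smallest() followed by __rmqueue_fallback(); it duplicates the fallbacks[] table purely for illustration and is not allocator code:

#include <linux/kernel.h>
#include <linux/mmzone.h>

/* Copy of the fallback table above, for this illustration only */
static const int fallback_example[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE },
};

static void rmqueue_search_order_example(void)
{
	int order, i;

	/* 1. Preferred list, smallest order first (__rmqueue_smallest) */
	for (order = 0; order < MAX_ORDER; order++)
		printk(KERN_DEBUG "order %d, MIGRATE_UNMOVABLE\n", order);

	/* 2. Fallback lists, largest order first (__rmqueue_fallback) */
	for (order = MAX_ORDER - 1; order >= 0; order--)
		for (i = 0; i < 3; i++) {
			int type = fallback_example[MIGRATE_UNMOVABLE][i];

			if (type == MIGRATE_RESERVE)
				continue;	/* handled last */
			printk(KERN_DEBUG "order %d, fallback type %d\n",
							order, type);
		}

	/* 3. Finally MIGRATE_RESERVE, via __rmqueue_smallest() again */
}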
@@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone) | |||
732 | { | 930 | { |
733 | unsigned long pfn, max_zone_pfn; | 931 | unsigned long pfn, max_zone_pfn; |
734 | unsigned long flags; | 932 | unsigned long flags; |
735 | int order; | 933 | int order, t; |
736 | struct list_head *curr; | 934 | struct list_head *curr; |
737 | 935 | ||
738 | if (!zone->spanned_pages) | 936 | if (!zone->spanned_pages) |
@@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone) | |||
749 | swsusp_unset_page_free(page); | 947 | swsusp_unset_page_free(page); |
750 | } | 948 | } |
751 | 949 | ||
752 | for (order = MAX_ORDER - 1; order >= 0; --order) | 950 | for_each_migratetype_order(order, t) { |
753 | list_for_each(curr, &zone->free_area[order].free_list) { | 951 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
754 | unsigned long i; | 952 | unsigned long i; |
755 | 953 | ||
756 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 954 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
757 | for (i = 0; i < (1UL << order); i++) | 955 | for (i = 0; i < (1UL << order); i++) |
758 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 956 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
759 | } | 957 | } |
760 | | 958 | } |
761 | spin_unlock_irqrestore(&zone->lock, flags); | 959 | spin_unlock_irqrestore(&zone->lock, flags); |
762 | } | 960 | } |
961 | #endif /* CONFIG_PM */ | ||
763 | 962 | ||
764 | /* | 963 | /* |
765 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 964 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
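mark_free_pages() above (and zone_init_free_lists() further down) now iterate with for_each_migratetype_order(), a helper this series adds outside this file. Shown here only as a sketch, it is approximately a nested loop over every (order, migratetype) pair:

/* Approximate form of the for_each_migratetype_order() helper used above
 * (the real definition lives in include/linux/mmzone.h, not in this file) */
#define for_each_migratetype_order(order, type) \
	for (order = 0; order < MAX_ORDER; order++) \
		for (type = 0; type < MIGRATE_TYPES; type++)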
@@ -772,7 +971,25 @@ void drain_local_pages(void) | |||
772 | __drain_pages(smp_processor_id()); | 971 | __drain_pages(smp_processor_id()); |
773 | local_irq_restore(flags); | 972 | local_irq_restore(flags); |
774 | } | 973 | } |
775 | #endif /* CONFIG_HIBERNATION */ | 974 | |
975 | void smp_drain_local_pages(void *arg) | ||
976 | { | ||
977 | drain_local_pages(); | ||
978 | } | ||
979 | |||
980 | /* | ||
981 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
982 | */ | ||
983 | void drain_all_local_pages(void) | ||
984 | { | ||
985 | unsigned long flags; | ||
986 | |||
987 | local_irq_save(flags); | ||
988 | __drain_pages(smp_processor_id()); | ||
989 | local_irq_restore(flags); | ||
990 | |||
991 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
992 | } | ||
776 | 993 | ||
777 | /* | 994 | /* |
778 | * Free a 0-order page | 995 | * Free a 0-order page |
@@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
797 | local_irq_save(flags); | 1014 | local_irq_save(flags); |
798 | __count_vm_event(PGFREE); | 1015 | __count_vm_event(PGFREE); |
799 | list_add(&page->lru, &pcp->list); | 1016 | list_add(&page->lru, &pcp->list); |
1017 | set_page_private(page, get_pageblock_migratetype(page)); | ||
800 | pcp->count++; | 1018 | pcp->count++; |
801 | if (pcp->count >= pcp->high) { | 1019 | if (pcp->count >= pcp->high) { |
802 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1020 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
@@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
846 | struct page *page; | 1064 | struct page *page; |
847 | int cold = !!(gfp_flags & __GFP_COLD); | 1065 | int cold = !!(gfp_flags & __GFP_COLD); |
848 | int cpu; | 1066 | int cpu; |
1067 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
849 | 1068 | ||
850 | again: | 1069 | again: |
851 | cpu = get_cpu(); | 1070 | cpu = get_cpu(); |
@@ -856,16 +1075,28 @@ again: | |||
856 | local_irq_save(flags); | 1075 | local_irq_save(flags); |
857 | if (!pcp->count) { | 1076 | if (!pcp->count) { |
858 | pcp->count = rmqueue_bulk(zone, 0, | 1077 | pcp->count = rmqueue_bulk(zone, 0, |
859 | pcp->batch, &pcp->list); | 1078 | pcp->batch, &pcp->list, migratetype); |
860 | if (unlikely(!pcp->count)) | 1079 | if (unlikely(!pcp->count)) |
861 | goto failed; | 1080 | goto failed; |
862 | } | 1081 | } |
863 | page = list_entry(pcp->list.next, struct page, lru); | 1082 | |
1083 | /* Find a page of the appropriate migrate type */ | ||
1084 | list_for_each_entry(page, &pcp->list, lru) | ||
1085 | if (page_private(page) == migratetype) | ||
1086 | break; | ||
1087 | |||
1088 | /* Allocate more to the pcp list if necessary */ | ||
1089 | if (unlikely(&page->lru == &pcp->list)) { | ||
1090 | pcp->count += rmqueue_bulk(zone, 0, | ||
1091 | pcp->batch, &pcp->list, migratetype); | ||
1092 | page = list_entry(pcp->list.next, struct page, lru); | ||
1093 | } | ||
1094 | |||
864 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
865 | pcp->count--; | 1096 | pcp->count--; |
866 | } else { | 1097 | } else { |
867 | spin_lock_irqsave(&zone->lock, flags); | 1098 | spin_lock_irqsave(&zone->lock, flags); |
868 | page = __rmqueue(zone, order); | 1099 | page = __rmqueue(zone, order, migratetype); |
869 | spin_unlock(&zone->lock); | 1100 | spin_unlock(&zone->lock); |
870 | if (!page) | 1101 | if (!page) |
871 | goto failed; | 1102 | goto failed; |
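buffered_rmqueue() above derives the migratetype from the caller's GFP flags via allocflags_to_migratetype(), which is not defined in page_alloc.c. A rough sketch of that mapping (the real helper is added to include/linux/gfp.h by this series; treat the exact bit manipulation here as an approximation):

/* Rough sketch of allocflags_to_migratetype(): fold __GFP_MOVABLE and
 * __GFP_RECLAIMABLE into one of the MIGRATE_* indices used above. */
static inline int allocflags_to_migratetype_sketch(gfp_t gfp_flags)
{
	/* When grouping is disabled, everything shares one set of lists */
	if (unlikely(page_group_by_mobility_disabled))
		return MIGRATE_UNMOVABLE;

	/* bit 1: movable, bit 0: reclaimable */
	return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
		((gfp_flags & __GFP_RECLAIMABLE) != 0);
}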
@@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1032 | * | 1263 | * |
1033 | * If the zonelist cache is present in the passed in zonelist, then | 1264 | * If the zonelist cache is present in the passed in zonelist, then |
1034 | * returns a pointer to the allowed node mask (either the current | 1265 | * returns a pointer to the allowed node mask (either the current |
1035 | * tasks mems_allowed, or node_online_map.) | 1266 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) |
1036 | * | 1267 | * |
1037 | * If the zonelist cache is not available for this zonelist, does | 1268 | * If the zonelist cache is not available for this zonelist, does |
1038 | * nothing and returns NULL. | 1269 | * nothing and returns NULL. |
@@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1061 | 1292 | ||
1062 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1293 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1063 | &cpuset_current_mems_allowed : | 1294 | &cpuset_current_mems_allowed : |
1064 | &node_online_map; | 1295 | &node_states[N_HIGH_MEMORY]; |
1065 | return allowednodes; | 1296 | return allowednodes; |
1066 | } | 1297 | } |
1067 | 1298 | ||
@@ -1183,9 +1414,6 @@ zonelist_scan: | |||
1183 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1414 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1184 | continue; | 1415 | continue; |
1185 | zone = *z; | 1416 | zone = *z; |
1186 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
1187 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
1188 | break; | ||
1189 | if ((alloc_flags & ALLOC_CPUSET) && | 1417 | if ((alloc_flags & ALLOC_CPUSET) && |
1190 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1191 | goto try_next_zone; | 1419 | goto try_next_zone; |
@@ -1254,7 +1482,10 @@ restart: | |||
1254 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1482 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
1255 | 1483 | ||
1256 | if (unlikely(*z == NULL)) { | 1484 | if (unlikely(*z == NULL)) { |
1257 | /* Should this ever happen?? */ | 1485 | /* |
1486 | * Happens if we have an empty zonelist as a result of | ||
1487 | * GFP_THISNODE being used on a memoryless node | ||
1488 | */ | ||
1258 | return NULL; | 1489 | return NULL; |
1259 | } | 1490 | } |
1260 | 1491 | ||
@@ -1346,6 +1577,9 @@ nofail_alloc: | |||
1346 | 1577 | ||
1347 | cond_resched(); | 1578 | cond_resched(); |
1348 | 1579 | ||
1580 | if (order != 0) | ||
1581 | drain_all_local_pages(); | ||
1582 | |||
1349 | if (likely(did_some_progress)) { | 1583 | if (likely(did_some_progress)) { |
1350 | page = get_page_from_freelist(gfp_mask, order, | 1584 | page = get_page_from_freelist(gfp_mask, order, |
1351 | zonelist, alloc_flags); | 1585 | zonelist, alloc_flags); |
@@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1794 | return node; | 2028 | return node; |
1795 | } | 2029 | } |
1796 | 2030 | ||
1797 | for_each_online_node(n) { | 2031 | for_each_node_state(n, N_HIGH_MEMORY) { |
1798 | cpumask_t tmp; | 2032 | cpumask_t tmp; |
1799 | 2033 | ||
1800 | /* Don't want a node to appear more than once */ | 2034 | /* Don't want a node to appear more than once */ |
@@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
1850 | } | 2084 | } |
1851 | 2085 | ||
1852 | /* | 2086 | /* |
2087 | * Build gfp_thisnode zonelists | ||
2088 | */ | ||
2089 | static void build_thisnode_zonelists(pg_data_t *pgdat) | ||
2090 | { | ||
2091 | enum zone_type i; | ||
2092 | int j; | ||
2093 | struct zonelist *zonelist; | ||
2094 | |||
2095 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
2096 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | ||
2097 | j = build_zonelists_node(pgdat, zonelist, 0, i); | ||
2098 | zonelist->zones[j] = NULL; | ||
2099 | } | ||
2100 | } | ||
2101 | |||
2102 | /* | ||
1853 | * Build zonelists ordered by zone and nodes within zones. | 2103 | * Build zonelists ordered by zone and nodes within zones. |
1854 | * This results in conserving DMA zone[s] until all Normal memory is | 2104 | * This results in conserving DMA zone[s] until all Normal memory is |
1855 | * exhausted, but results in overflowing to remote node while memory | 2105 | * exhausted, but results in overflowing to remote node while memory |
@@ -1915,7 +2165,8 @@ static int default_zonelist_order(void) | |||
1915 | * If there is a node whose DMA/DMA32 memory is very big area on | 2165 | * If there is a node whose DMA/DMA32 memory is very big area on |
1916 | * local memory, NODE_ORDER may be suitable. | 2166 | * local memory, NODE_ORDER may be suitable. |
1917 | */ | 2167 | */ |
1918 | average_size = total_size / (num_online_nodes() + 1); | 2168 | average_size = total_size / |
2169 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | ||
1919 | for_each_online_node(nid) { | 2170 | for_each_online_node(nid) { |
1920 | low_kmem_size = 0; | 2171 | low_kmem_size = 0; |
1921 | total_size = 0; | 2172 | total_size = 0; |
@@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1953 | int order = current_zonelist_order; | 2204 | int order = current_zonelist_order; |
1954 | 2205 | ||
1955 | /* initialize zonelists */ | 2206 | /* initialize zonelists */ |
1956 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2207 | for (i = 0; i < MAX_ZONELISTS; i++) { |
1957 | zonelist = pgdat->node_zonelists + i; | 2208 | zonelist = pgdat->node_zonelists + i; |
1958 | zonelist->zones[0] = NULL; | 2209 | zonelist->zones[0] = NULL; |
1959 | } | 2210 | } |
@@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
1998 | /* calculate node order -- i.e., DMA last! */ | 2249 | /* calculate node order -- i.e., DMA last! */ |
1999 | build_zonelists_in_zone_order(pgdat, j); | 2250 | build_zonelists_in_zone_order(pgdat, j); |
2000 | } | 2251 | } |
2252 | |||
2253 | build_thisnode_zonelists(pgdat); | ||
2001 | } | 2254 | } |
2002 | 2255 | ||
2003 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2256 | /* Construct the zonelist performance cache - see further mmzone.h */ |
@@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy) | |||
2078 | int nid; | 2331 | int nid; |
2079 | 2332 | ||
2080 | for_each_online_node(nid) { | 2333 | for_each_online_node(nid) { |
2081 | build_zonelists(NODE_DATA(nid)); | 2334 | pg_data_t *pgdat = NODE_DATA(nid); |
2082 | build_zonelist_cache(NODE_DATA(nid)); | 2335 | |
2336 | build_zonelists(pgdat); | ||
2337 | build_zonelist_cache(pgdat); | ||
2083 | } | 2338 | } |
2084 | return 0; | 2339 | return 0; |
2085 | } | 2340 | } |
@@ -2098,9 +2353,23 @@ void build_all_zonelists(void) | |||
2098 | /* cpuset refresh routine should be here */ | 2353 | /* cpuset refresh routine should be here */ |
2099 | } | 2354 | } |
2100 | vm_total_pages = nr_free_pagecache_pages(); | 2355 | vm_total_pages = nr_free_pagecache_pages(); |
2101 | printk("Built %i zonelists in %s order. Total pages: %ld\n", | 2356 | /* |
2357 | * Disable grouping by mobility if the number of pages in the | ||
2358 | * system is too low to allow the mechanism to work. It would be | ||
2359 | * more accurate, but expensive to check per-zone. This check is | ||
2360 | * made on memory-hotadd so a system can start with mobility | ||
2361 | * disabled and enable it later | ||
2362 | */ | ||
2363 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | ||
2364 | page_group_by_mobility_disabled = 1; | ||
2365 | else | ||
2366 | page_group_by_mobility_disabled = 0; | ||
2367 | |||
2368 | printk("Built %i zonelists in %s order, mobility grouping %s. " | ||
2369 | "Total pages: %ld\n", | ||
2102 | num_online_nodes(), | 2370 | num_online_nodes(), |
2103 | zonelist_order_name[current_zonelist_order], | 2371 | zonelist_order_name[current_zonelist_order], |
2372 | page_group_by_mobility_disabled ? "off" : "on", | ||
2104 | vm_total_pages); | 2373 | vm_total_pages); |
2105 | #ifdef CONFIG_NUMA | 2374 | #ifdef CONFIG_NUMA |
2106 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2375 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
@@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
2176 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2445 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
2177 | 2446 | ||
2178 | /* | 2447 | /* |
2448 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | ||
2449 | * of blocks reserved is based on zone->pages_min. The memory within the | ||
2450 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | ||
2451 | * higher will lead to a bigger reserve which will get freed as contiguous | ||
2452 | * blocks as reclaim kicks in | ||
2453 | */ | ||
2454 | static void setup_zone_migrate_reserve(struct zone *zone) | ||
2455 | { | ||
2456 | unsigned long start_pfn, pfn, end_pfn; | ||
2457 | struct page *page; | ||
2458 | unsigned long reserve, block_migratetype; | ||
2459 | |||
2460 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | ||
2461 | start_pfn = zone->zone_start_pfn; | ||
2462 | end_pfn = start_pfn + zone->spanned_pages; | ||
2463 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | ||
2464 | pageblock_order; | ||
2465 | |||
2466 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
2467 | if (!pfn_valid(pfn)) | ||
2468 | continue; | ||
2469 | page = pfn_to_page(pfn); | ||
2470 | |||
2471 | /* Blocks with reserved pages will never free, skip them. */ | ||
2472 | if (PageReserved(page)) | ||
2473 | continue; | ||
2474 | |||
2475 | block_migratetype = get_pageblock_migratetype(page); | ||
2476 | |||
2477 | /* If this block is reserved, account for it */ | ||
2478 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | ||
2479 | reserve--; | ||
2480 | continue; | ||
2481 | } | ||
2482 | |||
2483 | /* Suitable for reserving if this block is movable */ | ||
2484 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | ||
2485 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | ||
2486 | move_freepages_block(zone, page, MIGRATE_RESERVE); | ||
2487 | reserve--; | ||
2488 | continue; | ||
2489 | } | ||
2490 | |||
2491 | /* | ||
2492 | * If the reserve is met and this is a previous reserved block, | ||
2493 | * take it back | ||
2494 | */ | ||
2495 | if (block_migratetype == MIGRATE_RESERVE) { | ||
2496 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2497 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
2498 | } | ||
2499 | } | ||
2500 | } | ||
2501 | |||
2502 | /* | ||
2179 | * Initially all pages are reserved - free ones are freed | 2503 | * Initially all pages are reserved - free ones are freed |
2180 | * up by free_all_bootmem() once the early boot process is | 2504 | * up by free_all_bootmem() once the early boot process is |
2181 | * done. Non-atomic initialization, single-pass. | 2505 | * done. Non-atomic initialization, single-pass. |
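A quick worked example for the reserve sizing in setup_zone_migrate_reserve() above; the concrete numbers are assumptions chosen only to illustrate the arithmetic:

/*
 * Assume 4KB pages and pageblock_order = 10, so pageblock_nr_pages = 1024.
 * A zone with pages_min = 1500 pages gives
 *	reserve = roundup(1500, 1024) >> 10 = 2048 >> 10 = 2,
 * so the scan converts the first two suitable MIGRATE_MOVABLE pageblocks it
 * finds into MIGRATE_RESERVE (moving their free pages with
 * move_freepages_block()), and hands any surplus MIGRATE_RESERVE blocks it
 * meets later in the zone back to MIGRATE_MOVABLE.
 */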
@@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2204 | init_page_count(page); | 2528 | init_page_count(page); |
2205 | reset_page_mapcount(page); | 2529 | reset_page_mapcount(page); |
2206 | SetPageReserved(page); | 2530 | SetPageReserved(page); |
2531 | |||
2532 | /* | ||
2533 | * Mark the block movable so that blocks are reserved for | ||
2534 | * movable at startup. This will force kernel allocations | ||
2535 | * to reserve their blocks rather than leaking throughout | ||
2536 | * the address space during boot when many long-lived | ||
2537 | * kernel allocations are made. Later some blocks near | ||
2538 | * the start are marked MIGRATE_RESERVE by | ||
2539 | * setup_zone_migrate_reserve() | ||
2540 | */ | ||
2541 | if ((pfn & (pageblock_nr_pages-1))) | ||
2542 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
2543 | |||
2207 | INIT_LIST_HEAD(&page->lru); | 2544 | INIT_LIST_HEAD(&page->lru); |
2208 | #ifdef WANT_PAGE_VIRTUAL | 2545 | #ifdef WANT_PAGE_VIRTUAL |
2209 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2546 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
@@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
2216 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2553 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
2217 | struct zone *zone, unsigned long size) | 2554 | struct zone *zone, unsigned long size) |
2218 | { | 2555 | { |
2219 | int order; | 2556 | int order, t; |
2220 | for (order = 0; order < MAX_ORDER ; order++) { | 2557 | for_each_migratetype_order(order, t) { |
2221 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 2558 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
2222 | zone->free_area[order].nr_free = 0; | 2559 | zone->free_area[order].nr_free = 0; |
2223 | } | 2560 | } |
2224 | } | 2561 | } |
@@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; | |||
2324 | static int __cpuinit process_zones(int cpu) | 2661 | static int __cpuinit process_zones(int cpu) |
2325 | { | 2662 | { |
2326 | struct zone *zone, *dzone; | 2663 | struct zone *zone, *dzone; |
2664 | int node = cpu_to_node(cpu); | ||
2665 | |||
2666 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
2327 | 2667 | ||
2328 | for_each_zone(zone) { | 2668 | for_each_zone(zone) { |
2329 | 2669 | ||
@@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu) | |||
2331 | continue; | 2671 | continue; |
2332 | 2672 | ||
2333 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2673 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
2334 | GFP_KERNEL, cpu_to_node(cpu)); | 2674 | GFP_KERNEL, node); |
2335 | if (!zone_pcp(zone, cpu)) | 2675 | if (!zone_pcp(zone, cpu)) |
2336 | goto bad; | 2676 | goto bad; |
2337 | 2677 | ||
@@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
2444 | * To use this new node's memory, further consideration will be | 2784 | * To use this new node's memory, further consideration will be |
2445 | * necessary. | 2785 | * necessary. |
2446 | */ | 2786 | */ |
2447 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2787 | zone->wait_table = vmalloc(alloc_size); |
2448 | } | 2788 | } |
2449 | if (!zone->wait_table) | 2789 | if (!zone->wait_table) |
2450 | return -ENOMEM; | 2790 | return -ENOMEM; |
@@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
2680 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3020 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
2681 | } | 3021 | } |
2682 | 3022 | ||
2683 | if (*start_pfn == -1UL) { | 3023 | if (*start_pfn == -1UL) |
2684 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
2685 | *start_pfn = 0; | 3024 | *start_pfn = 0; |
2686 | } | ||
2687 | 3025 | ||
2688 | /* Push the node boundaries out if requested */ | 3026 | /* Push the node boundaries out if requested */ |
2689 | account_node_boundary(nid, start_pfn, end_pfn); | 3027 | account_node_boundary(nid, start_pfn, end_pfn); |
@@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
2901 | realtotalpages); | 3239 | realtotalpages); |
2902 | } | 3240 | } |
2903 | 3241 | ||
3242 | #ifndef CONFIG_SPARSEMEM | ||
3243 | /* | ||
3244 | * Calculate the size of the zone->blockflags rounded to an unsigned long | ||
3245 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | ||
3246 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | ||
3247 | * round what is now in bits to nearest long in bits, then return it in | ||
3248 | * bytes. | ||
3249 | */ | ||
3250 | static unsigned long __init usemap_size(unsigned long zonesize) | ||
3251 | { | ||
3252 | unsigned long usemapsize; | ||
3253 | |||
3254 | usemapsize = roundup(zonesize, pageblock_nr_pages); | ||
3255 | usemapsize = usemapsize >> pageblock_order; | ||
3256 | usemapsize *= NR_PAGEBLOCK_BITS; | ||
3257 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | ||
3258 | |||
3259 | return usemapsize / 8; | ||
3260 | } | ||
3261 | |||
3262 | static void __init setup_usemap(struct pglist_data *pgdat, | ||
3263 | struct zone *zone, unsigned long zonesize) | ||
3264 | { | ||
3265 | unsigned long usemapsize = usemap_size(zonesize); | ||
3266 | zone->pageblock_flags = NULL; | ||
3267 | if (usemapsize) { | ||
3268 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | ||
3269 | memset(zone->pageblock_flags, 0, usemapsize); | ||
3270 | } | ||
3271 | } | ||
3272 | #else | ||
3273 | static void inline setup_usemap(struct pglist_data *pgdat, | ||
3274 | struct zone *zone, unsigned long zonesize) {} | ||
3275 | #endif /* CONFIG_SPARSEMEM */ | ||
3276 | |||
3277 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
3278 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | ||
3279 | static inline void __init set_pageblock_order(unsigned int order) | ||
3280 | { | ||
3281 | /* Check that pageblock_nr_pages has not already been setup */ | ||
3282 | if (pageblock_order) | ||
3283 | return; | ||
3284 | |||
3285 | /* | ||
3286 | * Assume the largest contiguous order of interest is a huge page. | ||
3287 | * This value may be variable depending on boot parameters on IA64 | ||
3288 | */ | ||
3289 | pageblock_order = order; | ||
3290 | } | ||
3291 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3292 | |||
3293 | /* Defined this way to avoid accidently referencing HUGETLB_PAGE_ORDER */ | ||
3294 | #define set_pageblock_order(x) do {} while (0) | ||
3295 | |||
3296 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
3297 | |||
2904 | /* | 3298 | /* |
2905 | * Set up the zone data structures: | 3299 | * Set up the zone data structures: |
2906 | * - mark all pages reserved | 3300 | * - mark all pages reserved |
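The usemap sizing above is easiest to check with concrete numbers. Below is a self-contained userspace restatement of usemap_size() under assumed values (pageblock_order = 10, NR_PAGEBLOCK_BITS = 4); it is only a sketch for working through the arithmetic, not kernel code:

#include <stdio.h>

#define EX_PAGEBLOCK_ORDER	10
#define EX_PAGEBLOCK_NR_PAGES	(1UL << EX_PAGEBLOCK_ORDER)
#define EX_NR_PAGEBLOCK_BITS	4

static unsigned long roundup_to(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

/* Mirrors usemap_size() above with the assumed constants */
static unsigned long example_usemap_size(unsigned long zonesize)
{
	unsigned long usemapsize;

	usemapsize = roundup_to(zonesize, EX_PAGEBLOCK_NR_PAGES);
	usemapsize >>= EX_PAGEBLOCK_ORDER;		/* number of pageblocks */
	usemapsize *= EX_NR_PAGEBLOCK_BITS;		/* bits required */
	usemapsize = roundup_to(usemapsize, 8 * sizeof(unsigned long));
	return usemapsize / 8;				/* bytes */
}

int main(void)
{
	/* A 4GB zone of 4KB pages: 1048576 pages -> 1024 blocks -> 512 bytes */
	printf("%lu bytes\n", example_usemap_size(1048576UL));
	return 0;
}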
@@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2981 | if (!size) | 3375 | if (!size) |
2982 | continue; | 3376 | continue; |
2983 | 3377 | ||
3378 | set_pageblock_order(HUGETLB_PAGE_ORDER); | ||
3379 | setup_usemap(pgdat, zone, size); | ||
2984 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3380 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2985 | size, MEMMAP_EARLY); | 3381 | size, MEMMAP_EARLY); |
2986 | BUG_ON(ret); | 3382 | BUG_ON(ret); |
@@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) | |||
3234 | return max_pfn; | 3630 | return max_pfn; |
3235 | } | 3631 | } |
3236 | 3632 | ||
3237 | unsigned long __init early_calculate_totalpages(void) | 3633 | /* |
3634 | * early_calculate_totalpages() | ||
3635 | * Sum pages in active regions for movable zone. | ||
3636 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | ||
3637 | */ | ||
3638 | static unsigned long __init early_calculate_totalpages(void) | ||
3238 | { | 3639 | { |
3239 | int i; | 3640 | int i; |
3240 | unsigned long totalpages = 0; | 3641 | unsigned long totalpages = 0; |
3241 | 3642 | ||
3242 | for (i = 0; i < nr_nodemap_entries; i++) | 3643 | for (i = 0; i < nr_nodemap_entries; i++) { |
3243 | totalpages += early_node_map[i].end_pfn - | 3644 | unsigned long pages = early_node_map[i].end_pfn - |
3244 | early_node_map[i].start_pfn; | 3645 | early_node_map[i].start_pfn; |
3245 | | 3646 | totalpages += pages; |
3246 | return totalpages; | 3647 | if (pages) |
3648 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | ||
3649 | } | ||
3650 | return totalpages; | ||
3247 | } | 3651 | } |
3248 | 3652 | ||
3249 | /* | 3653 | /* |
@@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3257 | int i, nid; | 3661 | int i, nid; |
3258 | unsigned long usable_startpfn; | 3662 | unsigned long usable_startpfn; |
3259 | unsigned long kernelcore_node, kernelcore_remaining; | 3663 | unsigned long kernelcore_node, kernelcore_remaining; |
3260 | int usable_nodes = num_online_nodes(); | 3664 | unsigned long totalpages = early_calculate_totalpages(); |
3665 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | ||
3261 | 3666 | ||
3262 | /* | 3667 | /* |
3263 | * If movablecore was specified, calculate what size of | 3668 | * If movablecore was specified, calculate what size of |
@@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3268 | * what movablecore would have allowed. | 3673 | * what movablecore would have allowed. |
3269 | */ | 3674 | */ |
3270 | if (required_movablecore) { | 3675 | if (required_movablecore) { |
3271 | unsigned long totalpages = early_calculate_totalpages(); | ||
3272 | unsigned long corepages; | 3676 | unsigned long corepages; |
3273 | 3677 | ||
3274 | /* | 3678 | /* |
@@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
3293 | restart: | 3697 | restart: |
3294 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3698 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
3295 | kernelcore_node = required_kernelcore / usable_nodes; | 3699 | kernelcore_node = required_kernelcore / usable_nodes; |
3296 | for_each_online_node(nid) { | 3700 | for_each_node_state(nid, N_HIGH_MEMORY) { |
3297 | /* | 3701 | /* |
3298 | * Recalculate kernelcore_node if the division per node | 3702 | * Recalculate kernelcore_node if the division per node |
3299 | * now exceeds what is necessary to satisfy the requested | 3703 | * now exceeds what is necessary to satisfy the requested |
@@ -3385,6 +3789,20 @@ restart: | |||
3385 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3789 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
3386 | } | 3790 | } |
3387 | 3791 | ||
3792 | /* Any regular memory on that node ? */ | ||
3793 | static void check_for_regular_memory(pg_data_t *pgdat) | ||
3794 | { | ||
3795 | #ifdef CONFIG_HIGHMEM | ||
3796 | enum zone_type zone_type; | ||
3797 | |||
3798 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | ||
3799 | struct zone *zone = &pgdat->node_zones[zone_type]; | ||
3800 | if (zone->present_pages) | ||
3801 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
3802 | } | ||
3803 | #endif | ||
3804 | } | ||
3805 | |||
3388 | /** | 3806 | /** |
3389 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3807 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
3390 | * @max_zone_pfn: an array of max PFNs for each zone | 3808 | * @max_zone_pfn: an array of max PFNs for each zone |
@@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
3459 | pg_data_t *pgdat = NODE_DATA(nid); | 3877 | pg_data_t *pgdat = NODE_DATA(nid); |
3460 | free_area_init_node(nid, pgdat, NULL, | 3878 | free_area_init_node(nid, pgdat, NULL, |
3461 | find_min_pfn_for_node(nid), NULL); | 3879 | find_min_pfn_for_node(nid), NULL); |
3880 | |||
3881 | /* Any memory on that node */ | ||
3882 | if (pgdat->node_present_pages) | ||
3883 | node_set_state(nid, N_HIGH_MEMORY); | ||
3884 | check_for_regular_memory(pgdat); | ||
3462 | } | 3885 | } |
3463 | } | 3886 | } |
3464 | 3887 | ||
@@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void) | |||
3673 | 4096 | ||
3674 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4097 | zone->pages_low = zone->pages_min + (tmp >> 2); |
3675 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4098 | zone->pages_high = zone->pages_min + (tmp >> 1); |
4099 | setup_zone_migrate_reserve(zone); | ||
3676 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4100 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3677 | } | 4101 | } |
3678 | 4102 | ||
@@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page); | |||
3934 | EXPORT_SYMBOL(page_to_pfn); | 4358 | EXPORT_SYMBOL(page_to_pfn); |
3935 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4359 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
3936 | 4360 | ||
4361 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | ||
4362 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | ||
4363 | unsigned long pfn) | ||
4364 | { | ||
4365 | #ifdef CONFIG_SPARSEMEM | ||
4366 | return __pfn_to_section(pfn)->pageblock_flags; | ||
4367 | #else | ||
4368 | return zone->pageblock_flags; | ||
4369 | #endif /* CONFIG_SPARSEMEM */ | ||
4370 | } | ||
4371 | |||
4372 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | ||
4373 | { | ||
4374 | #ifdef CONFIG_SPARSEMEM | ||
4375 | pfn &= (PAGES_PER_SECTION-1); | ||
4376 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4377 | #else | ||
4378 | pfn = pfn - zone->zone_start_pfn; | ||
4379 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
4380 | #endif /* CONFIG_SPARSEMEM */ | ||
4381 | } | ||
4382 | |||
4383 | /** | ||
4384 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | ||
4385 | * @page: The page within the block of interest | ||
4386 | * @start_bitidx: The first bit of interest to retrieve | ||
4387 | * @end_bitidx: The last bit of interest | ||
4388 | * returns pageblock_bits flags | ||
4389 | */ | ||
4390 | unsigned long get_pageblock_flags_group(struct page *page, | ||
4391 | int start_bitidx, int end_bitidx) | ||
4392 | { | ||
4393 | struct zone *zone; | ||
4394 | unsigned long *bitmap; | ||
4395 | unsigned long pfn, bitidx; | ||
4396 | unsigned long flags = 0; | ||
4397 | unsigned long value = 1; | ||
4398 | |||
4399 | zone = page_zone(page); | ||
4400 | pfn = page_to_pfn(page); | ||
4401 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4402 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4403 | |||
4404 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4405 | if (test_bit(bitidx + start_bitidx, bitmap)) | ||
4406 | flags |= value; | ||
4407 | |||
4408 | return flags; | ||
4409 | } | ||
3937 | 4410 | ||
4411 | /** | ||
4412 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | ||
4413 | * @page: The page within the block of interest | ||
4414 | * @start_bitidx: The first bit of interest | ||
4415 | * @end_bitidx: The last bit of interest | ||
4416 | * @flags: The flags to set | ||
4417 | */ | ||
4418 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | ||
4419 | int start_bitidx, int end_bitidx) | ||
4420 | { | ||
4421 | struct zone *zone; | ||
4422 | unsigned long *bitmap; | ||
4423 | unsigned long pfn, bitidx; | ||
4424 | unsigned long value = 1; | ||
4425 | |||
4426 | zone = page_zone(page); | ||
4427 | pfn = page_to_pfn(page); | ||
4428 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
4429 | bitidx = pfn_to_bitidx(zone, pfn); | ||
4430 | |||
4431 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
4432 | if (flags & value) | ||
4433 | __set_bit(bitidx + start_bitidx, bitmap); | ||
4434 | else | ||
4435 | __clear_bit(bitidx + start_bitidx, bitmap); | ||
4436 | } | ||
4437 | |||
4438 | /* | ||
4439 | * This is designed as sub function...plz see page_isolation.c also. | ||
4440 | * set/clear page block's type to be ISOLATE. | ||
4441 | * page allocater never alloc memory from ISOLATE block. | ||
4442 | */ | ||
4443 | |||
4444 | int set_migratetype_isolate(struct page *page) | ||
4445 | { | ||
4446 | struct zone *zone; | ||
4447 | unsigned long flags; | ||
4448 | int ret = -EBUSY; | ||
4449 | |||
4450 | zone = page_zone(page); | ||
4451 | spin_lock_irqsave(&zone->lock, flags); | ||
4452 | /* | ||
4453 | * In future, more migrate types will be able to be isolation target. | ||
4454 | */ | ||
4455 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
4456 | goto out; | ||
4457 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
4458 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
4459 | ret = 0; | ||
4460 | out: | ||
4461 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4462 | if (!ret) | ||
4463 | drain_all_local_pages(); | ||
4464 | return ret; | ||
4465 | } | ||
4466 | |||
4467 | void unset_migratetype_isolate(struct page *page) | ||
4468 | { | ||
4469 | struct zone *zone; | ||
4470 | unsigned long flags; | ||
4471 | zone = page_zone(page); | ||
4472 | spin_lock_irqsave(&zone->lock, flags); | ||
4473 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
4474 | goto out; | ||
4475 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
4476 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
4477 | out: | ||
4478 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4479 | } | ||
4480 | |||
4481 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
4482 | /* | ||
4483 | * All pages in the range must be isolated before calling this. | ||
4484 | */ | ||
4485 | void | ||
4486 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
4487 | { | ||
4488 | struct page *page; | ||
4489 | struct zone *zone; | ||
4490 | int order, i; | ||
4491 | unsigned long pfn; | ||
4492 | unsigned long flags; | ||
4493 | /* find the first valid pfn */ | ||
4494 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
4495 | if (pfn_valid(pfn)) | ||
4496 | break; | ||
4497 | if (pfn == end_pfn) | ||
4498 | return; | ||
4499 | zone = page_zone(pfn_to_page(pfn)); | ||
4500 | spin_lock_irqsave(&zone->lock, flags); | ||
4501 | pfn = start_pfn; | ||
4502 | while (pfn < end_pfn) { | ||
4503 | if (!pfn_valid(pfn)) { | ||
4504 | pfn++; | ||
4505 | continue; | ||
4506 | } | ||
4507 | page = pfn_to_page(pfn); | ||
4508 | BUG_ON(page_count(page)); | ||
4509 | BUG_ON(!PageBuddy(page)); | ||
4510 | order = page_order(page); | ||
4511 | #ifdef CONFIG_DEBUG_VM | ||
4512 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | ||
4513 | pfn, 1 << order, end_pfn); | ||
4514 | #endif | ||
4515 | list_del(&page->lru); | ||
4516 | rmv_page_order(page); | ||
4517 | zone->free_area[order].nr_free--; | ||
4518 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
4519 | - (1UL << order)); | ||
4520 | for (i = 0; i < (1 << order); i++) | ||
4521 | SetPageReserved((page+i)); | ||
4522 | pfn += (1 << order); | ||
4523 | } | ||
4524 | spin_unlock_irqrestore(&zone->lock, flags); | ||
4525 | } | ||
4526 | #endif | ||
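For completeness, the bit-range accessors added at the end of this diff are consumed through thin migratetype wrappers: set_pageblock_migratetype() is the static helper added near the top of this file, while its getter counterpart lives in the headers and is, approximately:

/* Approximate form of the getter in include/linux/mmzone.h; the migratetype
 * of a pageblock is just the PB_migrate..PB_migrate_end group of bits */
#define get_pageblock_migratetype(page) \
	get_pageblock_flags_group(page, PB_migrate, PB_migrate_end)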