Diffstat (limited to 'mm/page_alloc.c'):
 -rw-r--r--  mm/page_alloc.c | 731
 1 file changed, 660 insertions(+), 71 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c59571cb7..d315e1127dc9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -41,24 +41,37 @@
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
+#include <linux/page-isolation.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
 
 /*
- * MCD - HACK: Find somewhere to initialize this EARLY, or make this
- * initializer cleaner
+ * Array of node states.
  */
-nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
-EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
-EXPORT_SYMBOL(node_possible_map);
+nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
+	[N_POSSIBLE] = NODE_MASK_ALL,
+	[N_ONLINE] = { { [0] = 1UL } },
+#ifndef CONFIG_NUMA
+	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
+#ifdef CONFIG_HIGHMEM
+	[N_HIGH_MEMORY] = { { [0] = 1UL } },
+#endif
+	[N_CPU] = { { [0] = 1UL } },
+#endif	/* NUMA */
+};
+EXPORT_SYMBOL(node_states);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
 
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+int pageblock_order __read_mostly;
+#endif
+
 static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
@@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve;
   static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
   unsigned long __initdata required_kernelcore;
-  unsigned long __initdata required_movablecore;
+  static unsigned long __initdata required_movablecore;
   unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 
   /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES;
 EXPORT_SYMBOL(nr_node_ids);
 #endif
 
+int page_group_by_mobility_disabled __read_mostly;
+
+static void set_pageblock_migratetype(struct page *page, int migratetype)
+{
+	set_pageblock_flags_group(page, (unsigned long)migratetype,
+					PB_migrate, PB_migrate_end);
+}
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
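The free lists this hunk starts tagging are split per migrate type by the companion change to include/linux/mmzone.h, which is not part of this file. A minimal sketch of the resulting shape follows; only the MIGRATE_* names are taken from this diff, while the enum ordering and the stand-in list_head are assumptions, so treat it as an illustration rather than the mmzone.h definition.

/*
 * Illustrative sketch only; the authoritative definitions live in
 * include/linux/mmzone.h.  Member order here is an assumption.
 */
struct list_head {			/* stand-in for the kernel's struct list_head */
	struct list_head *next, *prev;
};

enum {
	MIGRATE_UNMOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RESERVE,
	MIGRATE_ISOLATE,		/* used by the isolation hooks added at the end of this patch */
	MIGRATE_TYPES
};

struct free_area {
	struct list_head free_list[MIGRATE_TYPES];	/* one buddy list per mobility type */
	unsigned long nr_free;				/* free pages across all of the lists */
};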
@@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
-/*
- * function for dealing with page's order in buddy system.
- * zone->lock is already acquired when we use these.
- * So, we don't need atomic page->flags operations here.
- */
-static inline unsigned long page_order(struct page *page)
-{
-	return page_private(page);
-}
-
 static inline void set_page_order(struct page *page, int order)
 {
 	set_page_private(page, order);
@@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page,
 {
 	unsigned long page_idx;
 	int order_size = 1 << order;
+	int migratetype = get_pageblock_migratetype(page);
 
 	if (unlikely(PageCompound(page)))
 		destroy_compound_page(page, order);
@@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page,
 	__mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
 	while (order < MAX_ORDER-1) {
 		unsigned long combined_idx;
-		struct free_area *area;
 		struct page *buddy;
 
 		buddy = __page_find_buddy(page, page_idx, order);
@@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page,
 			break;		/* Move the buddy up one level. */
 
 		list_del(&buddy->lru);
-		area = zone->free_area + order;
-		area->nr_free--;
+		zone->free_area[order].nr_free--;
 		rmv_page_order(buddy);
 		combined_idx = __find_combined_index(page_idx, order);
 		page = page + (combined_idx - page_idx);
@@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru, &zone->free_area[order].free_list);
+	list_add(&page->lru,
+		&zone->free_area[order].free_list[migratetype]);
 	zone->free_area[order].nr_free++;
 }
 
@@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
  * -- wli
  */
 static inline void expand(struct zone *zone, struct page *page,
-	int low, int high, struct free_area *area)
+	int low, int high, struct free_area *area,
+	int migratetype)
 {
 	unsigned long size = 1 << high;
 
@@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page,
 		high--;
 		size >>= 1;
 		VM_BUG_ON(bad_range(zone, &page[size]));
-		list_add(&page[size].lru, &area->free_list);
+		list_add(&page[size].lru, &area->free_list[migratetype]);
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
@@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	return 0;
 }
 
 /*
- * Do the hard work of removing an element from the buddy allocator.
- * Call me with the zone->lock already held.
+ * Go through the free lists for the given migratetype and remove
+ * the smallest available page from the freelists
  */
-static struct page *__rmqueue(struct zone *zone, unsigned int order)
+static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+						int migratetype)
 {
-	struct free_area * area;
 	unsigned int current_order;
+	struct free_area * area;
 	struct page *page;
 
+	/* Find a page of the appropriate size in the preferred list */
 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
-		area = zone->free_area + current_order;
-		if (list_empty(&area->free_list))
+		area = &(zone->free_area[current_order]);
+		if (list_empty(&area->free_list[migratetype]))
 			continue;
 
-		page = list_entry(area->free_list.next, struct page, lru);
+		page = list_entry(area->free_list[migratetype].next,
+							struct page, lru);
 		list_del(&page->lru);
 		rmv_page_order(page);
 		area->nr_free--;
 		__mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
-		expand(zone, page, order, current_order, area);
+		expand(zone, page, order, current_order, area, migratetype);
 		return page;
 	}
 
 	return NULL;
 }
 
+
+/*
+ * This array describes the order lists are fallen back to when
+ * the free lists for the desirable migrate type are depleted
+ */
+static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
+	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
+	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
+	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE }, /* Never used */
+};
+
+/*
+ * Move the free pages in a range to the free lists of the requested type.
+ * Note that start_page and end_page are not aligned on a pageblock
+ * boundary. If alignment is required, use move_freepages_block()
+ */
+int move_freepages(struct zone *zone,
+			struct page *start_page, struct page *end_page,
+			int migratetype)
+{
+	struct page *page;
+	unsigned long order;
+	int pages_moved = 0;
+
+#ifndef CONFIG_HOLES_IN_ZONE
+	/*
+	 * page_zone is not safe to call in this context when
+	 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
+	 * anyway as we check zone boundaries in move_freepages_block().
+	 * Remove at a later date when no bug reports exist related to
+	 * grouping pages by mobility
+	 */
+	BUG_ON(page_zone(start_page) != page_zone(end_page));
+#endif
+
+	for (page = start_page; page <= end_page;) {
+		if (!pfn_valid_within(page_to_pfn(page))) {
+			page++;
+			continue;
+		}
+
+		if (!PageBuddy(page)) {
+			page++;
+			continue;
+		}
+
+		order = page_order(page);
+		list_del(&page->lru);
+		list_add(&page->lru,
+			&zone->free_area[order].free_list[migratetype]);
+		page += 1 << order;
+		pages_moved += 1 << order;
+	}
+
+	return pages_moved;
+}
+
+int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
+{
+	unsigned long start_pfn, end_pfn;
+	struct page *start_page, *end_page;
+
+	start_pfn = page_to_pfn(page);
+	start_pfn = start_pfn & ~(pageblock_nr_pages-1);
+	start_page = pfn_to_page(start_pfn);
+	end_page = start_page + pageblock_nr_pages - 1;
+	end_pfn = start_pfn + pageblock_nr_pages - 1;
+
+	/* Do not cross zone boundaries */
+	if (start_pfn < zone->zone_start_pfn)
+		start_page = page;
+	if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
+		return 0;
+
+	return move_freepages(zone, start_page, end_page, migratetype);
+}
+
+/* Return the page with the lowest PFN in the list */
+static struct page *min_page(struct list_head *list)
+{
+	unsigned long min_pfn = -1UL;
+	struct page *min_page = NULL, *page;
+
+	list_for_each_entry(page, list, lru) {
+		unsigned long pfn = page_to_pfn(page);
+		if (pfn < min_pfn) {
+			min_pfn = pfn;
+			min_page = page;
+		}
+	}
+
+	return min_page;
+}
+
+/* Remove an element from the buddy allocator from the fallback list */
+static struct page *__rmqueue_fallback(struct zone *zone, int order,
+						int start_migratetype)
+{
+	struct free_area * area;
+	int current_order;
+	struct page *page;
+	int migratetype, i;
+
+	/* Find the largest possible block of pages in the other list */
+	for (current_order = MAX_ORDER-1; current_order >= order;
+						--current_order) {
+		for (i = 0; i < MIGRATE_TYPES - 1; i++) {
+			migratetype = fallbacks[start_migratetype][i];
+
+			/* MIGRATE_RESERVE handled later if necessary */
+			if (migratetype == MIGRATE_RESERVE)
+				continue;
+
+			area = &(zone->free_area[current_order]);
+			if (list_empty(&area->free_list[migratetype]))
+				continue;
+
+			/* Bias kernel allocations towards low pfns */
+			page = list_entry(area->free_list[migratetype].next,
+					struct page, lru);
+			if (unlikely(start_migratetype != MIGRATE_MOVABLE))
+				page = min_page(&area->free_list[migratetype]);
+			area->nr_free--;
+
+			/*
+			 * If breaking a large block of pages, move all free
+			 * pages to the preferred allocation list. If falling
+			 * back for a reclaimable kernel allocation, be more
+			 * aggressive about taking ownership of free pages
+			 */
+			if (unlikely(current_order >= (pageblock_order >> 1)) ||
+					start_migratetype == MIGRATE_RECLAIMABLE) {
+				unsigned long pages;
+				pages = move_freepages_block(zone, page,
+								start_migratetype);
+
+				/* Claim the whole block if over half of it is free */
+				if (pages >= (1 << (pageblock_order-1)))
+					set_pageblock_migratetype(page,
+								start_migratetype);
+
+				migratetype = start_migratetype;
+			}
+
+			/* Remove the page from the freelists */
+			list_del(&page->lru);
+			rmv_page_order(page);
+			__mod_zone_page_state(zone, NR_FREE_PAGES,
+							-(1UL << order));
+
+			if (current_order == pageblock_order)
+				set_pageblock_migratetype(page,
+							start_migratetype);
+
+			expand(zone, page, order, current_order, area, migratetype);
+			return page;
+		}
+	}
+
+	/* Use MIGRATE_RESERVE rather than fail an allocation */
+	return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
+}
+
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order,
+						int migratetype)
+{
+	struct page *page;
+
+	page = __rmqueue_smallest(zone, order, migratetype);
+
+	if (unlikely(!page))
+		page = __rmqueue_fallback(zone, order, migratetype);
+
+	return page;
+}
+
 /*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency. Add them to the supplied list.
  * Returns the number of new pages which were placed at *list.
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
-			unsigned long count, struct list_head *list)
+			unsigned long count, struct list_head *list,
+			int migratetype)
 {
 	int i;
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order);
+		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
 			break;
-		list_add_tail(&page->lru, list);
+		list_add(&page->lru, list);
+		set_page_private(page, migratetype);
 	}
 	spin_unlock(&zone->lock);
 	return i;
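The fallback search above walks orders from largest to smallest and, within each order, tries the other migrate types in the order given by the fallbacks[] table, keeping MIGRATE_RESERVE strictly as a last resort. The standalone program below (not part of the patch) reproduces just that walk with stand-in constants and a boolean availability array in place of the real free lists, so the selection policy can be exercised on its own.

#include <stdbool.h>
#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
       MIGRATE_RESERVE, MIGRATE_TYPES };
#define MAX_ORDER 11			/* assumed; matches common configurations */

static const int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES - 1] = {
	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
	[MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE },
};

/* have[order][type]: is that free list non-empty? (stands in for the zone) */
static int pick_fallback(bool have[MAX_ORDER][MIGRATE_TYPES],
			 int order, int start_migratetype, int *picked_order)
{
	for (int current_order = MAX_ORDER - 1; current_order >= order; current_order--) {
		for (int i = 0; i < MIGRATE_TYPES - 1; i++) {
			int migratetype = fallbacks[start_migratetype][i];

			if (migratetype == MIGRATE_RESERVE)
				continue;	/* handled as the final fallback below */
			if (!have[current_order][migratetype])
				continue;
			*picked_order = current_order;
			return migratetype;
		}
	}
	*picked_order = order;
	return MIGRATE_RESERVE;			/* rather than failing the allocation */
}

int main(void)
{
	bool have[MAX_ORDER][MIGRATE_TYPES] = { { false } };
	int picked_order;

	/* Only a large movable block is free; an unmovable request steals from it. */
	have[10][MIGRATE_MOVABLE] = true;
	int t = pick_fallback(have, 3, MIGRATE_UNMOVABLE, &picked_order);
	printf("fall back to migrate type %d at order %d\n", t, picked_order);
	return 0;
}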
@@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone)
 {
 	unsigned long pfn, max_zone_pfn;
 	unsigned long flags;
-	int order;
+	int order, t;
 	struct list_head *curr;
 
 	if (!zone->spanned_pages)
@@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone)
 			swsusp_unset_page_free(page);
 	}
 
-	for (order = MAX_ORDER - 1; order >= 0; --order)
-		list_for_each(curr, &zone->free_area[order].free_list) {
+	for_each_migratetype_order(order, t) {
+		list_for_each(curr, &zone->free_area[order].free_list[t]) {
 			unsigned long i;
 
 			pfn = page_to_pfn(list_entry(curr, struct page, lru));
 			for (i = 0; i < (1UL << order); i++)
 				swsusp_set_page_free(pfn_to_page(pfn + i));
 		}
-
+	}
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
+#endif /* CONFIG_PM */
 
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
@@ -772,7 +971,25 @@ void drain_local_pages(void)
 	__drain_pages(smp_processor_id());
 	local_irq_restore(flags);
 }
-#endif /* CONFIG_HIBERNATION */
+
+void smp_drain_local_pages(void *arg)
+{
+	drain_local_pages();
+}
+
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ */
+void drain_all_local_pages(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__drain_pages(smp_processor_id());
+	local_irq_restore(flags);
+
+	smp_call_function(smp_drain_local_pages, NULL, 0, 1);
+}
 
 /*
  * Free a 0-order page
@@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 	local_irq_save(flags);
 	__count_vm_event(PGFREE);
 	list_add(&page->lru, &pcp->list);
+	set_page_private(page, get_pageblock_migratetype(page));
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
 		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist,
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
 	int cpu;
+	int migratetype = allocflags_to_migratetype(gfp_flags);
 
 again:
 	cpu = get_cpu();
@@ -856,16 +1075,28 @@ again:
 		local_irq_save(flags);
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list);
+					pcp->batch, &pcp->list, migratetype);
 			if (unlikely(!pcp->count))
 				goto failed;
 		}
-		page = list_entry(pcp->list.next, struct page, lru);
+
+		/* Find a page of the appropriate migrate type */
+		list_for_each_entry(page, &pcp->list, lru)
+			if (page_private(page) == migratetype)
+				break;
+
+		/* Allocate more to the pcp list if necessary */
+		if (unlikely(&page->lru == &pcp->list)) {
+			pcp->count += rmqueue_bulk(zone, 0,
+					pcp->batch, &pcp->list, migratetype);
+			page = list_entry(pcp->list.next, struct page, lru);
+		}
+
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
-		page = __rmqueue(zone, order);
+		page = __rmqueue(zone, order, migratetype);
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
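buffered_rmqueue() now derives the migrate type from the caller's GFP flags via allocflags_to_migratetype(), which this series adds to include/linux/gfp.h rather than to this file. A rough sketch of that mapping is below; the flag bits and the exact expression in gfp.h are assumptions, and only the overall policy (movable, reclaimable, otherwise unmovable, with grouping optionally disabled) is implied by this diff.

/* Stand-in flag bits; the real __GFP_MOVABLE/__GFP_RECLAIMABLE values differ. */
#define SK_GFP_RECLAIMABLE	(1u << 0)
#define SK_GFP_MOVABLE		(1u << 1)

enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE };

static int page_group_by_mobility_disabled;	/* stand-in for the flag defined earlier in this patch */

static int sketch_allocflags_to_migratetype(unsigned int gfp_flags)
{
	if (page_group_by_mobility_disabled)
		return MIGRATE_UNMOVABLE;	/* everything shares one set of lists */
	if (gfp_flags & SK_GFP_MOVABLE)
		return MIGRATE_MOVABLE;		/* e.g. user pages and page cache */
	if (gfp_flags & SK_GFP_RECLAIMABLE)
		return MIGRATE_RECLAIMABLE;	/* e.g. reclaimable slab allocations */
	return MIGRATE_UNMOVABLE;
}

int main(void)
{
	return sketch_allocflags_to_migratetype(SK_GFP_MOVABLE) == MIGRATE_MOVABLE ? 0 : 1;
}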
@@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
  *
  * If the zonelist cache is present in the passed in zonelist, then
  * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_online_map.)
+ * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
  *
  * If the zonelist cache is not available for this zonelist, does
  * nothing and returns NULL.
@@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 
 	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
 					&cpuset_current_mems_allowed :
-					&node_online_map;
+					&node_states[N_HIGH_MEMORY];
 	return allowednodes;
 }
 
@@ -1183,9 +1414,6 @@ zonelist_scan:
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		zone = *z;
-		if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
-			zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
-				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
@@ -1254,7 +1482,10 @@ restart:
 	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 
 	if (unlikely(*z == NULL)) {
-		/* Should this ever happen?? */
+		/*
+		 * Happens if we have an empty zonelist as a result of
+		 * GFP_THISNODE being used on a memoryless node
+		 */
 		return NULL;
 	}
 
@@ -1346,6 +1577,9 @@ nofail_alloc:
 
 	cond_resched();
 
+	if (order != 0)
+		drain_all_local_pages();
+
 	if (likely(did_some_progress)) {
 		page = get_page_from_freelist(gfp_mask, order,
 						zonelist, alloc_flags);
@@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		return node;
 	}
 
-	for_each_online_node(n) {
+	for_each_node_state(n, N_HIGH_MEMORY) {
 		cpumask_t tmp;
 
 		/* Don't want a node to appear more than once */
@@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 }
 
 /*
+ * Build gfp_thisnode zonelists
+ */
+static void build_thisnode_zonelists(pg_data_t *pgdat)
+{
+	enum zone_type i;
+	int j;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
+		j = build_zonelists_node(pgdat, zonelist, 0, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
  * Build zonelists ordered by zone and nodes within zones.
  * This results in conserving DMA zone[s] until all Normal memory is
  * exhausted, but results in overflowing to remote node while memory
@@ -1915,7 +2165,8 @@ static int default_zonelist_order(void)
 	 * If there is a node whose DMA/DMA32 memory is very big area on
 	 * local memory, NODE_ORDER may be suitable.
 	 */
-	average_size = total_size / (num_online_nodes() + 1);
+	average_size = total_size /
+				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
 	for_each_online_node(nid) {
 		low_kmem_size = 0;
 		total_size = 0;
@@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat)
 	int order = current_zonelist_order;
 
 	/* initialize zonelists */
-	for (i = 0; i < MAX_NR_ZONES; i++) {
+	for (i = 0; i < MAX_ZONELISTS; i++) {
 		zonelist = pgdat->node_zonelists + i;
 		zonelist->zones[0] = NULL;
 	}
@@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat)
 		/* calculate node order -- i.e., DMA last! */
 		build_zonelists_in_zone_order(pgdat, j);
 	}
+
+	build_thisnode_zonelists(pgdat);
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
@@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy)
 	int nid;
 
 	for_each_online_node(nid) {
-		build_zonelists(NODE_DATA(nid));
-		build_zonelist_cache(NODE_DATA(nid));
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		build_zonelists(pgdat);
+		build_zonelist_cache(pgdat);
 	}
 	return 0;
 }
@@ -2098,9 +2353,23 @@ void build_all_zonelists(void)
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-	printk("Built %i zonelists in %s order. Total pages: %ld\n",
+	/*
+	 * Disable grouping by mobility if the number of pages in the
+	 * system is too low to allow the mechanism to work. It would be
+	 * more accurate, but expensive to check per-zone. This check is
+	 * made on memory-hotadd so a system can start with mobility
+	 * disabled and enable it later
+	 */
+	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
+		page_group_by_mobility_disabled = 1;
+	else
+		page_group_by_mobility_disabled = 0;
+
+	printk("Built %i zonelists in %s order, mobility grouping %s. "
+		"Total pages: %ld\n",
 			num_online_nodes(),
 			zonelist_order_name[current_zonelist_order],
+			page_group_by_mobility_disabled ? "off" : "on",
 			vm_total_pages);
 #ifdef CONFIG_NUMA
 	printk("Policy zone: %s\n", zone_names[policy_zone]);
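For scale, and assuming 4 KiB pages, a pageblock_order of 9 (so pageblock_nr_pages = 512) and the five migrate types this series defines (the four in the fallback table above plus MIGRATE_ISOLATE), the check above disables grouping when vm_total_pages < 512 * 5 = 2560 pages, i.e. on systems with less than roughly 10 MiB of memory, where there are too few pageblocks for per-type free lists to be useful.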
@@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size)
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
 /*
+ * Mark a number of pageblocks as MIGRATE_RESERVE. The number
+ * of blocks reserved is based on zone->pages_min. The memory within the
+ * reserve will tend to store contiguous free pages. Setting min_free_kbytes
+ * higher will lead to a bigger reserve which will get freed as contiguous
+ * blocks as reclaim kicks in
+ */
+static void setup_zone_migrate_reserve(struct zone *zone)
+{
+	unsigned long start_pfn, pfn, end_pfn;
+	struct page *page;
+	unsigned long reserve, block_migratetype;
+
+	/* Get the start pfn, end pfn and the number of blocks to reserve */
+	start_pfn = zone->zone_start_pfn;
+	end_pfn = start_pfn + zone->spanned_pages;
+	reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
+							pageblock_order;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+
+		/* Blocks with reserved pages will never free, skip them. */
+		if (PageReserved(page))
+			continue;
+
+		block_migratetype = get_pageblock_migratetype(page);
+
+		/* If this block is reserved, account for it */
+		if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
+			reserve--;
+			continue;
+		}
+
+		/* Suitable for reserving if this block is movable */
+		if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
+			set_pageblock_migratetype(page, MIGRATE_RESERVE);
+			move_freepages_block(zone, page, MIGRATE_RESERVE);
+			reserve--;
+			continue;
+		}
+
+		/*
+		 * If the reserve is met and this is a previous reserved block,
+		 * take it back
+		 */
+		if (block_migratetype == MIGRATE_RESERVE) {
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			move_freepages_block(zone, page, MIGRATE_MOVABLE);
+		}
+	}
+}
+
+/*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
  * done. Non-atomic initialization, single-pass.
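As a worked example of the sizing above, assuming 4 KiB pages and pageblock_order = 9 (so pageblock_nr_pages = 512 and each pageblock spans 2 MiB): a zone with pages_min = 1000 gives roundup(1000, 512) = 1024 and 1024 >> 9 = 2, so two pageblocks (4 MiB) are marked MIGRATE_RESERVE. Raising min_free_kbytes raises pages_min and therefore the number of reserved blocks, which is exactly the behaviour the comment describes.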
@@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		init_page_count(page);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
+
+		/*
+		 * Mark the block movable so that blocks are reserved for
+		 * movable at startup. This will force kernel allocations
+		 * to reserve their blocks rather than leaking throughout
+		 * the address space during boot when many long-lived
+		 * kernel allocations are made. Later some blocks near
+		 * the start are marked MIGRATE_RESERVE by
+		 * setup_zone_migrate_reserve()
+		 */
+		if ((pfn & (pageblock_nr_pages-1)))
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+
 		INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
 		/* The shift won't overflow because ZONE_NORMAL is below 4G. */
@@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
 				struct zone *zone, unsigned long size)
 {
-	int order;
-	for (order = 0; order < MAX_ORDER ; order++) {
-		INIT_LIST_HEAD(&zone->free_area[order].free_list);
+	int order, t;
+	for_each_migratetype_order(order, t) {
+		INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
 		zone->free_area[order].nr_free = 0;
 	}
 }
@@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS];
 static int __cpuinit process_zones(int cpu)
 {
 	struct zone *zone, *dzone;
+	int node = cpu_to_node(cpu);
+
+	node_set_state(node, N_CPU);	/* this node has a cpu */
 
 	for_each_zone(zone) {
 
@@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu)
 			continue;
 
 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, cpu_to_node(cpu));
+					 GFP_KERNEL, node);
 		if (!zone_pcp(zone, cpu))
 			goto bad;
 
@@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 		 * To use this new node's memory, further consideration will be
 		 * necessary.
 		 */
-		zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+		zone->wait_table = vmalloc(alloc_size);
 	}
 	if (!zone->wait_table)
 		return -ENOMEM;
@@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 		*end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
 	}
 
-	if (*start_pfn == -1UL) {
-		printk(KERN_WARNING "Node %u active with no memory\n", nid);
+	if (*start_pfn == -1UL)
 		*start_pfn = 0;
-	}
 
 	/* Push the node boundaries out if requested */
 	account_node_boundary(nid, start_pfn, end_pfn);
@@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
 							realtotalpages);
 }
 
+#ifndef CONFIG_SPARSEMEM
+/*
+ * Calculate the size of the zone->blockflags rounded to an unsigned long
+ * Start by making sure zonesize is a multiple of pageblock_order by rounding
+ * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
+ * round what is now in bits to nearest long in bits, then return it in
+ * bytes.
+ */
+static unsigned long __init usemap_size(unsigned long zonesize)
+{
+	unsigned long usemapsize;
+
+	usemapsize = roundup(zonesize, pageblock_nr_pages);
+	usemapsize = usemapsize >> pageblock_order;
+	usemapsize *= NR_PAGEBLOCK_BITS;
+	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
+
+	return usemapsize / 8;
+}
+
+static void __init setup_usemap(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long zonesize)
+{
+	unsigned long usemapsize = usemap_size(zonesize);
+	zone->pageblock_flags = NULL;
+	if (usemapsize) {
+		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
+		memset(zone->pageblock_flags, 0, usemapsize);
+	}
+}
+#else
+static void inline setup_usemap(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long zonesize) {}
+#endif /* CONFIG_SPARSEMEM */
+
+#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
+/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
+static inline void __init set_pageblock_order(unsigned int order)
+{
+	/* Check that pageblock_nr_pages has not already been setup */
+	if (pageblock_order)
+		return;
+
+	/*
+	 * Assume the largest contiguous order of interest is a huge page.
+	 * This value may be variable depending on boot parameters on IA64
+	 */
+	pageblock_order = order;
+}
+#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
+/* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */
+#define set_pageblock_order(x)	do {} while (0)
+
+#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
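Worked through for a 1 GiB zone of 4 KiB pages (262144 pages), a pageblock_order of 9 and, as an assumption taken from include/linux/pageblock-flags.h in this series, NR_PAGEBLOCK_BITS = 3: roundup(262144, 512) = 262144, shifting right by 9 gives 512 pageblocks, multiplying by 3 gives 1536 bits, and rounding up to 64-bit longs leaves 1536 bits, so usemap_size() returns 192 bytes of usemap for the whole zone.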
@@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		if (!size)
 			continue;
 
+		set_pageblock_order(HUGETLB_PAGE_ORDER);
+		setup_usemap(pgdat, zone, size);
 		ret = init_currently_empty_zone(zone, zone_start_pfn,
 						size, MEMMAP_EARLY);
 		BUG_ON(ret);
@@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void)
 	return max_pfn;
 }
 
-unsigned long __init early_calculate_totalpages(void)
+/*
+ * early_calculate_totalpages()
+ * Sum pages in active regions for movable zone.
+ * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ */
+static unsigned long __init early_calculate_totalpages(void)
 {
 	int i;
 	unsigned long totalpages = 0;
 
-	for (i = 0; i < nr_nodemap_entries; i++)
-		totalpages += early_node_map[i].end_pfn -
+	for (i = 0; i < nr_nodemap_entries; i++) {
+		unsigned long pages = early_node_map[i].end_pfn -
 						early_node_map[i].start_pfn;
-
-	return totalpages;
+		totalpages += pages;
+		if (pages)
+			node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
+	}
+	return totalpages;
 }
 
 /*
@@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
 	int i, nid;
 	unsigned long usable_startpfn;
 	unsigned long kernelcore_node, kernelcore_remaining;
-	int usable_nodes = num_online_nodes();
+	unsigned long totalpages = early_calculate_totalpages();
+	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
 
 	/*
 	 * If movablecore was specified, calculate what size of
@@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
 	 * what movablecore would have allowed.
 	 */
 	if (required_movablecore) {
-		unsigned long totalpages = early_calculate_totalpages();
 		unsigned long corepages;
 
 		/*
@@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
 restart:
 	/* Spread kernelcore memory as evenly as possible throughout nodes */
 	kernelcore_node = required_kernelcore / usable_nodes;
-	for_each_online_node(nid) {
+	for_each_node_state(nid, N_HIGH_MEMORY) {
 		/*
 		 * Recalculate kernelcore_node if the division per node
 		 * now exceeds what is necessary to satisfy the requested
@@ -3385,6 +3789,20 @@ restart:
 			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
 }
 
+/* Any regular memory on that node ? */
+static void check_for_regular_memory(pg_data_t *pgdat)
+{
+#ifdef CONFIG_HIGHMEM
+	enum zone_type zone_type;
+
+	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+		struct zone *zone = &pgdat->node_zones[zone_type];
+		if (zone->present_pages)
+			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+	}
+#endif
+}
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 		pg_data_t *pgdat = NODE_DATA(nid);
 		free_area_init_node(nid, pgdat, NULL,
 				find_min_pfn_for_node(nid), NULL);
+
+		/* Any memory on that node */
+		if (pgdat->node_present_pages)
+			node_set_state(nid, N_HIGH_MEMORY);
+		check_for_regular_memory(pgdat);
 	}
 }
 
@@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void)
 
 		zone->pages_low = zone->pages_min + (tmp >> 2);
 		zone->pages_high = zone->pages_min + (tmp >> 1);
+		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 
@@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page);
 EXPORT_SYMBOL(page_to_pfn);
 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
 
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
+							unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	return __pfn_to_section(pfn)->pageblock_flags;
+#else
+	return zone->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	pfn &= (PAGES_PER_SECTION-1);
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+	pfn = pfn - zone->zone_start_pfn;
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @start_bitidx: The first bit of interest to retrieve
+ * @end_bitidx: The last bit of interest
+ * returns pageblock_bits flags
+ */
+unsigned long get_pageblock_flags_group(struct page *page,
+					int start_bitidx, int end_bitidx)
+{
+	struct zone *zone;
+	unsigned long *bitmap;
+	unsigned long pfn, bitidx;
+	unsigned long flags = 0;
+	unsigned long value = 1;
+
+	zone = page_zone(page);
+	pfn = page_to_pfn(page);
+	bitmap = get_pageblock_bitmap(zone, pfn);
+	bitidx = pfn_to_bitidx(zone, pfn);
+
+	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
+		if (test_bit(bitidx + start_bitidx, bitmap))
+			flags |= value;
+
+	return flags;
+}
 
+/**
+ * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @start_bitidx: The first bit of interest
+ * @end_bitidx: The last bit of interest
+ * @flags: The flags to set
+ */
+void set_pageblock_flags_group(struct page *page, unsigned long flags,
+					int start_bitidx, int end_bitidx)
+{
+	struct zone *zone;
+	unsigned long *bitmap;
+	unsigned long pfn, bitidx;
+	unsigned long value = 1;
+
+	zone = page_zone(page);
+	pfn = page_to_pfn(page);
+	bitmap = get_pageblock_bitmap(zone, pfn);
+	bitidx = pfn_to_bitidx(zone, pfn);
+
+	for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
+		if (flags & value)
+			__set_bit(bitidx + start_bitidx, bitmap);
+		else
+			__clear_bit(bitidx + start_bitidx, bitmap);
+}
+
+/*
+ * This is designed as a sub-function; please see page_isolation.c also.
+ * set/clear page block's type to be ISOLATE.
+ * The page allocator never allocates memory from an ISOLATE block.
+ */
+
+int set_migratetype_isolate(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags;
+	int ret = -EBUSY;
+
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, flags);
+	/*
+	 * In future, more migrate types will be able to be isolation target.
+	 */
+	if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+		goto out;
+	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+	move_freepages_block(zone, page, MIGRATE_ISOLATE);
+	ret = 0;
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+	if (!ret)
+		drain_all_local_pages();
+	return ret;
+}
+
+void unset_migratetype_isolate(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags;
+	zone = page_zone(page);
+	spin_lock_irqsave(&zone->lock, flags);
+	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+		goto out;
+	set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	move_freepages_block(zone, page, MIGRATE_MOVABLE);
+out:
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * All pages in the range must be isolated before calling this.
+ */
+void
+__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+	struct page *page;
+	struct zone *zone;
+	int order, i;
+	unsigned long pfn;
+	unsigned long flags;
+	/* find the first valid pfn */
+	for (pfn = start_pfn; pfn < end_pfn; pfn++)
+		if (pfn_valid(pfn))
+			break;
+	if (pfn == end_pfn)
+		return;
+	zone = page_zone(pfn_to_page(pfn));
+	spin_lock_irqsave(&zone->lock, flags);
+	pfn = start_pfn;
+	while (pfn < end_pfn) {
+		if (!pfn_valid(pfn)) {
+			pfn++;
+			continue;
+		}
+		page = pfn_to_page(pfn);
+		BUG_ON(page_count(page));
+		BUG_ON(!PageBuddy(page));
+		order = page_order(page);
+#ifdef CONFIG_DEBUG_VM
+		printk(KERN_INFO "remove from free list %lx %d %lx\n",
+		       pfn, 1 << order, end_pfn);
+#endif
+		list_del(&page->lru);
+		rmv_page_order(page);
+		zone->free_area[order].nr_free--;
+		__mod_zone_page_state(zone, NR_FREE_PAGES,
+				      - (1UL << order));
+		for (i = 0; i < (1 << order); i++)
+			SetPageReserved((page+i));
+		pfn += (1 << order);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+}
+#endif
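To make the bit layout concrete, the standalone program below mimics the loops in get_pageblock_flags_group() and set_pageblock_flags_group(): each pageblock owns NR_PAGEBLOCK_BITS consecutive bits at bitidx = pageblock_index * NR_PAGEBLOCK_BITS, and a flag group is read and written one bit at a time. It is not part of the patch, the constants are assumptions matching this series, and the word-indexed helpers stand in for the kernel's test_bit()/__set_bit()/__clear_bit().

#include <assert.h>
#include <limits.h>
#include <string.h>

#define NR_PAGEBLOCK_BITS	3	/* assumed: 3 bits, enough for the migrate types */
#define PB_migrate		0
#define PB_migrate_end		(PB_migrate + 3 - 1)
#define BITS_PER_LONG		(CHAR_BIT * sizeof(unsigned long))

static void set_group(unsigned long *bitmap, unsigned long bitidx,
		      unsigned long flags, int start, int end)
{
	for (unsigned long value = 1; start <= end; start++, value <<= 1) {
		if (flags & value)
			bitmap[(bitidx + start) / BITS_PER_LONG] |=
				1UL << ((bitidx + start) % BITS_PER_LONG);
		else
			bitmap[(bitidx + start) / BITS_PER_LONG] &=
				~(1UL << ((bitidx + start) % BITS_PER_LONG));
	}
}

static unsigned long get_group(const unsigned long *bitmap, unsigned long bitidx,
			       int start, int end)
{
	unsigned long flags = 0;

	for (unsigned long value = 1; start <= end; start++, value <<= 1)
		if (bitmap[(bitidx + start) / BITS_PER_LONG] &
		    (1UL << ((bitidx + start) % BITS_PER_LONG)))
			flags |= value;
	return flags;
}

int main(void)
{
	unsigned long usemap[4];
	unsigned long pageblock = 17;	/* the 18th pageblock of the zone */
	unsigned long bitidx = pageblock * NR_PAGEBLOCK_BITS;

	memset(usemap, 0, sizeof(usemap));
	set_group(usemap, bitidx, 3 /* an example migrate-type value */,
		  PB_migrate, PB_migrate_end);
	assert(get_group(usemap, bitidx, PB_migrate, PB_migrate_end) == 3);
	return 0;
}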