author     Dave Jones <davej@redhat.com>   2006-12-12 18:13:32 -0500
committer  Dave Jones <davej@redhat.com>   2006-12-12 18:13:32 -0500
commit     f0eef25339f92f7cd4aeea23d9ae97987a5a1e82 (patch)
tree       2472e94d39f43a9580a6d2d5d92de0b749023263 /mm/page_alloc.c
parent     0cfea5dd98205f2fa318836da664a7d7df1afbc1 (diff)
parent     e1036502e5263851259d147771226161e5ccc85a (diff)
Merge ../linus
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  1156
1 file changed, 1030 insertions(+), 126 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..e6b17b2989e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,10 @@
37#include <linux/vmalloc.h> 37#include <linux/vmalloc.h>
38#include <linux/mempolicy.h> 38#include <linux/mempolicy.h>
39#include <linux/stop_machine.h> 39#include <linux/stop_machine.h>
40#include <linux/sort.h>
41#include <linux/pfn.h>
42#include <linux/backing-dev.h>
43#include <linux/fault-inject.h>
40 44
41#include <asm/tlbflush.h> 45#include <asm/tlbflush.h>
42#include <asm/div64.h> 46#include <asm/div64.h>
@@ -80,14 +84,7 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
80 84
81EXPORT_SYMBOL(totalram_pages); 85EXPORT_SYMBOL(totalram_pages);
82 86
83/* 87static char * const zone_names[MAX_NR_ZONES] = {
84 * Used by page_zone() to look up the address of the struct zone whose
85 * id is encoded in the upper bits of page->flags
86 */
87struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
88EXPORT_SYMBOL(zone_table);
89
90static char *zone_names[MAX_NR_ZONES] = {
91 "DMA", 88 "DMA",
92#ifdef CONFIG_ZONE_DMA32 89#ifdef CONFIG_ZONE_DMA32
93 "DMA32", 90 "DMA32",
@@ -102,6 +99,38 @@ int min_free_kbytes = 1024;
102 99
103unsigned long __meminitdata nr_kernel_pages; 100unsigned long __meminitdata nr_kernel_pages;
104unsigned long __meminitdata nr_all_pages; 101unsigned long __meminitdata nr_all_pages;
102static unsigned long __initdata dma_reserve;
103
104#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
105 /*
106 * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct
107 * ranges of memory (RAM) that may be registered with add_active_range().
108 * Ranges passed to add_active_range() will be merged if possible
109 * so the number of times add_active_range() can be called is
110 * related to the number of nodes and the number of holes
111 */
112 #ifdef CONFIG_MAX_ACTIVE_REGIONS
113 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
114 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
115 #else
116 #if MAX_NUMNODES >= 32
117 /* If there can be many nodes, allow up to 50 holes per node */
118 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
119 #else
120 /* By default, allow up to 256 distinct regions */
121 #define MAX_ACTIVE_REGIONS 256
122 #endif
123 #endif
124
125 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
126 int __initdata nr_nodemap_entries;
127 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
128 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
129#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
130 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
131 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
132#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
133#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
105 134
106#ifdef CONFIG_DEBUG_VM 135#ifdef CONFIG_DEBUG_VM
107static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 136static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -202,7 +231,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
202 int i; 231 int i;
203 int nr_pages = 1 << order; 232 int nr_pages = 1 << order;
204 233
205 page[1].lru.next = (void *)free_compound_page; /* set dtor */ 234 set_compound_page_dtor(page, free_compound_page);
206 page[1].lru.prev = (void *)order; 235 page[1].lru.prev = (void *)order;
207 for (i = 0; i < nr_pages; i++) { 236 for (i = 0; i < nr_pages; i++) {
208 struct page *p = page + i; 237 struct page *p = page + i;
@@ -451,7 +480,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order)
451 spin_lock(&zone->lock); 480 spin_lock(&zone->lock);
452 zone->all_unreclaimable = 0; 481 zone->all_unreclaimable = 0;
453 zone->pages_scanned = 0; 482 zone->pages_scanned = 0;
454 __free_one_page(page, zone ,order); 483 __free_one_page(page, zone, order);
455 spin_unlock(&zone->lock); 484 spin_unlock(&zone->lock);
456} 485}
457 486
@@ -461,17 +490,16 @@ static void __free_pages_ok(struct page *page, unsigned int order)
461 int i; 490 int i;
462 int reserved = 0; 491 int reserved = 0;
463 492
464 arch_free_page(page, order);
465 if (!PageHighMem(page))
466 debug_check_no_locks_freed(page_address(page),
467 PAGE_SIZE<<order);
468
469 for (i = 0 ; i < (1 << order) ; ++i) 493 for (i = 0 ; i < (1 << order) ; ++i)
470 reserved += free_pages_check(page + i); 494 reserved += free_pages_check(page + i);
471 if (reserved) 495 if (reserved)
472 return; 496 return;
473 497
498 if (!PageHighMem(page))
499 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
500 arch_free_page(page, order);
474 kernel_map_pages(page, 1 << order, 0); 501 kernel_map_pages(page, 1 << order, 0);
502
475 local_irq_save(flags); 503 local_irq_save(flags);
476 __count_vm_events(PGFREE, 1 << order); 504 __count_vm_events(PGFREE, 1 << order);
477 free_one_page(page_zone(page), page, order); 505 free_one_page(page_zone(page), page, order);
@@ -571,6 +599,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
571 1 << PG_checked | 1 << PG_mappedtodisk); 599 1 << PG_checked | 1 << PG_mappedtodisk);
572 set_page_private(page, 0); 600 set_page_private(page, 0);
573 set_page_refcounted(page); 601 set_page_refcounted(page);
602
603 arch_alloc_page(page, order);
574 kernel_map_pages(page, 1 << order, 1); 604 kernel_map_pages(page, 1 << order, 1);
575 605
576 if (gfp_flags & __GFP_ZERO) 606 if (gfp_flags & __GFP_ZERO)
@@ -656,9 +686,15 @@ void drain_node_pages(int nodeid)
656 686
657 pcp = &pset->pcp[i]; 687 pcp = &pset->pcp[i];
658 if (pcp->count) { 688 if (pcp->count) {
689 int to_drain;
690
659 local_irq_save(flags); 691 local_irq_save(flags);
660 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 692 if (pcp->count >= pcp->batch)
661 pcp->count = 0; 693 to_drain = pcp->batch;
694 else
695 to_drain = pcp->count;
696 free_pages_bulk(zone, to_drain, &pcp->list, 0);
697 pcp->count -= to_drain;
662 local_irq_restore(flags); 698 local_irq_restore(flags);
663 } 699 }
664 } 700 }
@@ -666,7 +702,6 @@ void drain_node_pages(int nodeid)
666} 702}
667#endif 703#endif
668 704
669#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
670static void __drain_pages(unsigned int cpu) 705static void __drain_pages(unsigned int cpu)
671{ 706{
672 unsigned long flags; 707 unsigned long flags;
@@ -688,7 +723,6 @@ static void __drain_pages(unsigned int cpu)
688 } 723 }
689 } 724 }
690} 725}
691#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
692 726
693#ifdef CONFIG_PM 727#ifdef CONFIG_PM
694 728
@@ -747,13 +781,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
747 struct per_cpu_pages *pcp; 781 struct per_cpu_pages *pcp;
748 unsigned long flags; 782 unsigned long flags;
749 783
750 arch_free_page(page, 0);
751
752 if (PageAnon(page)) 784 if (PageAnon(page))
753 page->mapping = NULL; 785 page->mapping = NULL;
754 if (free_pages_check(page)) 786 if (free_pages_check(page))
755 return; 787 return;
756 788
789 if (!PageHighMem(page))
790 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
791 arch_free_page(page, 0);
757 kernel_map_pages(page, 1, 0); 792 kernel_map_pages(page, 1, 0);
758 793
759 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 794 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
@@ -818,7 +853,7 @@ again:
818 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 853 pcp = &zone_pcp(zone, cpu)->pcp[cold];
819 local_irq_save(flags); 854 local_irq_save(flags);
820 if (!pcp->count) { 855 if (!pcp->count) {
821 pcp->count += rmqueue_bulk(zone, 0, 856 pcp->count = rmqueue_bulk(zone, 0,
822 pcp->batch, &pcp->list); 857 pcp->batch, &pcp->list);
823 if (unlikely(!pcp->count)) 858 if (unlikely(!pcp->count))
824 goto failed; 859 goto failed;
@@ -858,6 +893,91 @@ failed:
858#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 893#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
859#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 894#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
860 895
896#ifdef CONFIG_FAIL_PAGE_ALLOC
897
898static struct fail_page_alloc_attr {
899 struct fault_attr attr;
900
901 u32 ignore_gfp_highmem;
902 u32 ignore_gfp_wait;
903
904#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
905
906 struct dentry *ignore_gfp_highmem_file;
907 struct dentry *ignore_gfp_wait_file;
908
909#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
910
911} fail_page_alloc = {
912 .attr = FAULT_ATTR_INITIALIZER,
913 .ignore_gfp_wait = 1,
914 .ignore_gfp_highmem = 1,
915};
916
917static int __init setup_fail_page_alloc(char *str)
918{
919 return setup_fault_attr(&fail_page_alloc.attr, str);
920}
921__setup("fail_page_alloc=", setup_fail_page_alloc);
922
923static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
924{
925 if (gfp_mask & __GFP_NOFAIL)
926 return 0;
927 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
928 return 0;
929 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
930 return 0;
931
932 return should_fail(&fail_page_alloc.attr, 1 << order);
933}
934
935#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
936
937static int __init fail_page_alloc_debugfs(void)
938{
939 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
940 struct dentry *dir;
941 int err;
942
943 err = init_fault_attr_dentries(&fail_page_alloc.attr,
944 "fail_page_alloc");
945 if (err)
946 return err;
947 dir = fail_page_alloc.attr.dentries.dir;
948
949 fail_page_alloc.ignore_gfp_wait_file =
950 debugfs_create_bool("ignore-gfp-wait", mode, dir,
951 &fail_page_alloc.ignore_gfp_wait);
952
953 fail_page_alloc.ignore_gfp_highmem_file =
954 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
955 &fail_page_alloc.ignore_gfp_highmem);
956
957 if (!fail_page_alloc.ignore_gfp_wait_file ||
958 !fail_page_alloc.ignore_gfp_highmem_file) {
959 err = -ENOMEM;
960 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
961 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
962 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
963 }
964
965 return err;
966}
967
968late_initcall(fail_page_alloc_debugfs);
969
970#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
971
972#else /* CONFIG_FAIL_PAGE_ALLOC */
973
974static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
975{
976 return 0;
977}
978
979#endif /* CONFIG_FAIL_PAGE_ALLOC */
980
861/* 981/*
862 * Return 1 if free pages are above 'mark'. This takes into account the order 982 * Return 1 if free pages are above 'mark'. This takes into account the order
863 * of the allocation. 983 * of the allocation.
@@ -866,7 +986,8 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
866 int classzone_idx, int alloc_flags) 986 int classzone_idx, int alloc_flags)
867{ 987{
868 /* free_pages my go negative - that's OK */ 988 /* free_pages my go negative - that's OK */
869 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 989 unsigned long min = mark;
990 long free_pages = z->free_pages - (1 << order) + 1;
870 int o; 991 int o;
871 992
872 if (alloc_flags & ALLOC_HIGH) 993 if (alloc_flags & ALLOC_HIGH)
@@ -889,31 +1010,160 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
889 return 1; 1010 return 1;
890} 1011}
891 1012
1013#ifdef CONFIG_NUMA
892/* 1014/*
893 * get_page_from_freeliest goes through the zonelist trying to allocate 1015 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1016 * skip over zones that are not allowed by the cpuset, or that have
1017 * been recently (in last second) found to be nearly full. See further
1018 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1019 * that have to skip over alot of full or unallowed zones.
1020 *
1021 * If the zonelist cache is present in the passed in zonelist, then
1022 * returns a pointer to the allowed node mask (either the current
1023 * tasks mems_allowed, or node_online_map.)
1024 *
1025 * If the zonelist cache is not available for this zonelist, does
1026 * nothing and returns NULL.
1027 *
1028 * If the fullzones BITMAP in the zonelist cache is stale (more than
1029 * a second since last zap'd) then we zap it out (clear its bits.)
1030 *
1031 * We hold off even calling zlc_setup, until after we've checked the
1032 * first zone in the zonelist, on the theory that most allocations will
1033 * be satisfied from that first zone, so best to examine that zone as
1034 * quickly as we can.
1035 */
1036static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1037{
1038 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1039 nodemask_t *allowednodes; /* zonelist_cache approximation */
1040
1041 zlc = zonelist->zlcache_ptr;
1042 if (!zlc)
1043 return NULL;
1044
1045 if (jiffies - zlc->last_full_zap > 1 * HZ) {
1046 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1047 zlc->last_full_zap = jiffies;
1048 }
1049
1050 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1051 &cpuset_current_mems_allowed :
1052 &node_online_map;
1053 return allowednodes;
1054}
1055
1056/*
1057 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1058 * if it is worth looking at further for free memory:
1059 * 1) Check that the zone isn't thought to be full (doesn't have its
1060 * bit set in the zonelist_cache fullzones BITMAP).
1061 * 2) Check that the zones node (obtained from the zonelist_cache
1062 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1063 * Return true (non-zero) if zone is worth looking at further, or
1064 * else return false (zero) if it is not.
1065 *
1066 * This check -ignores- the distinction between various watermarks,
1067 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1068 * found to be full for any variation of these watermarks, it will
1069 * be considered full for up to one second by all requests, unless
1070 * we are so low on memory on all allowed nodes that we are forced
1071 * into the second scan of the zonelist.
1072 *
1073 * In the second scan we ignore this zonelist cache and exactly
1074 * apply the watermarks to all zones, even it is slower to do so.
1075 * We are low on memory in the second scan, and should leave no stone
1076 * unturned looking for a free page.
1077 */
1078static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1079 nodemask_t *allowednodes)
1080{
1081 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1082 int i; /* index of *z in zonelist zones */
1083 int n; /* node that zone *z is on */
1084
1085 zlc = zonelist->zlcache_ptr;
1086 if (!zlc)
1087 return 1;
1088
1089 i = z - zonelist->zones;
1090 n = zlc->z_to_n[i];
1091
1092 /* This zone is worth trying if it is allowed but not full */
1093 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1094}
1095
1096/*
1097 * Given 'z' scanning a zonelist, set the corresponding bit in
1098 * zlc->fullzones, so that subsequent attempts to allocate a page
1099 * from that zone don't waste time re-examining it.
1100 */
1101static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1102{
1103 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1104 int i; /* index of *z in zonelist zones */
1105
1106 zlc = zonelist->zlcache_ptr;
1107 if (!zlc)
1108 return;
1109
1110 i = z - zonelist->zones;
1111
1112 set_bit(i, zlc->fullzones);
1113}
1114
1115#else /* CONFIG_NUMA */
1116
1117static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1118{
1119 return NULL;
1120}
1121
1122static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1123 nodemask_t *allowednodes)
1124{
1125 return 1;
1126}
1127
1128static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1129{
1130}
1131#endif /* CONFIG_NUMA */
1132
1133/*
1134 * get_page_from_freelist goes through the zonelist trying to allocate
894 * a page. 1135 * a page.
895 */ 1136 */
896static struct page * 1137static struct page *
897get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1138get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
898 struct zonelist *zonelist, int alloc_flags) 1139 struct zonelist *zonelist, int alloc_flags)
899{ 1140{
900 struct zone **z = zonelist->zones; 1141 struct zone **z;
901 struct page *page = NULL; 1142 struct page *page = NULL;
902 int classzone_idx = zone_idx(*z); 1143 int classzone_idx = zone_idx(zonelist->zones[0]);
903 struct zone *zone; 1144 struct zone *zone;
1145 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1146 int zlc_active = 0; /* set if using zonelist_cache */
1147 int did_zlc_setup = 0; /* just call zlc_setup() one time */
904 1148
1149zonelist_scan:
905 /* 1150 /*
906 * Go through the zonelist once, looking for a zone with enough free. 1151 * Scan zonelist, looking for a zone with enough free.
907 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1152 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
908 */ 1153 */
1154 z = zonelist->zones;
1155
909 do { 1156 do {
1157 if (NUMA_BUILD && zlc_active &&
1158 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1159 continue;
910 zone = *z; 1160 zone = *z;
911 if (unlikely((gfp_mask & __GFP_THISNODE) && 1161 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
912 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1162 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
913 break; 1163 break;
914 if ((alloc_flags & ALLOC_CPUSET) && 1164 if ((alloc_flags & ALLOC_CPUSET) &&
915 !cpuset_zone_allowed(zone, gfp_mask)) 1165 !cpuset_zone_allowed(zone, gfp_mask))
916 continue; 1166 goto try_next_zone;
917 1167
918 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1168 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
919 unsigned long mark; 1169 unsigned long mark;
@@ -923,18 +1173,34 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
923 mark = zone->pages_low; 1173 mark = zone->pages_low;
924 else 1174 else
925 mark = zone->pages_high; 1175 mark = zone->pages_high;
926 if (!zone_watermark_ok(zone , order, mark, 1176 if (!zone_watermark_ok(zone, order, mark,
927 classzone_idx, alloc_flags)) 1177 classzone_idx, alloc_flags)) {
928 if (!zone_reclaim_mode || 1178 if (!zone_reclaim_mode ||
929 !zone_reclaim(zone, gfp_mask, order)) 1179 !zone_reclaim(zone, gfp_mask, order))
930 continue; 1180 goto this_zone_full;
1181 }
931 } 1182 }
932 1183
933 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1184 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
934 if (page) { 1185 if (page)
935 break; 1186 break;
1187this_zone_full:
1188 if (NUMA_BUILD)
1189 zlc_mark_zone_full(zonelist, z);
1190try_next_zone:
1191 if (NUMA_BUILD && !did_zlc_setup) {
1192 /* we do zlc_setup after the first zone is tried */
1193 allowednodes = zlc_setup(zonelist, alloc_flags);
1194 zlc_active = 1;
1195 did_zlc_setup = 1;
936 } 1196 }
937 } while (*(++z) != NULL); 1197 } while (*(++z) != NULL);
1198
1199 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1200 /* Disable zlc cache for second zonelist scan */
1201 zlc_active = 0;
1202 goto zonelist_scan;
1203 }
938 return page; 1204 return page;
939} 1205}
940 1206
@@ -956,6 +1222,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
956 1222
957 might_sleep_if(wait); 1223 might_sleep_if(wait);
958 1224
1225 if (should_fail_alloc_page(gfp_mask, order))
1226 return NULL;
1227
959restart: 1228restart:
960 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1229 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
961 1230
@@ -969,9 +1238,19 @@ restart:
969 if (page) 1238 if (page)
970 goto got_pg; 1239 goto got_pg;
971 1240
972 do { 1241 /*
1242 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1243 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1244 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
1245 * using a larger set of nodes after it has established that the
1246 * allowed per node queues are empty and that nodes are
1247 * over allocated.
1248 */
1249 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1250 goto nopage;
1251
1252 for (z = zonelist->zones; *z; z++)
973 wakeup_kswapd(*z, order); 1253 wakeup_kswapd(*z, order);
974 } while (*(++z));
975 1254
976 /* 1255 /*
977 * OK, we're below the kswapd watermark and have kicked background 1256 * OK, we're below the kswapd watermark and have kicked background
@@ -1005,6 +1284,7 @@ restart:
1005 1284
1006 /* This allocation should allow future memory freeing. */ 1285 /* This allocation should allow future memory freeing. */
1007 1286
1287rebalance:
1008 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1288 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1009 && !in_interrupt()) { 1289 && !in_interrupt()) {
1010 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1290 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
@@ -1015,7 +1295,7 @@ nofail_alloc:
1015 if (page) 1295 if (page)
1016 goto got_pg; 1296 goto got_pg;
1017 if (gfp_mask & __GFP_NOFAIL) { 1297 if (gfp_mask & __GFP_NOFAIL) {
1018 blk_congestion_wait(WRITE, HZ/50); 1298 congestion_wait(WRITE, HZ/50);
1019 goto nofail_alloc; 1299 goto nofail_alloc;
1020 } 1300 }
1021 } 1301 }
@@ -1026,7 +1306,6 @@ nofail_alloc:
1026 if (!wait) 1306 if (!wait)
1027 goto nopage; 1307 goto nopage;
1028 1308
1029rebalance:
1030 cond_resched(); 1309 cond_resched();
1031 1310
1032 /* We now go into synchronous reclaim */ 1311 /* We now go into synchronous reclaim */
@@ -1078,7 +1357,7 @@ rebalance:
1078 do_retry = 1; 1357 do_retry = 1;
1079 } 1358 }
1080 if (do_retry) { 1359 if (do_retry) {
1081 blk_congestion_wait(WRITE, HZ/50); 1360 congestion_wait(WRITE, HZ/50);
1082 goto rebalance; 1361 goto rebalance;
1083 } 1362 }
1084 1363
@@ -1222,14 +1501,12 @@ unsigned int nr_free_pagecache_pages(void)
1222{ 1501{
1223 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1502 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1224} 1503}
1225#ifdef CONFIG_NUMA 1504
1226static void show_node(struct zone *zone) 1505static inline void show_node(struct zone *zone)
1227{ 1506{
1228 printk("Node %ld ", zone_to_nid(zone)); 1507 if (NUMA_BUILD)
1508 printk("Node %d ", zone_to_nid(zone));
1229} 1509}
1230#else
1231#define show_node(zone) do { } while (0)
1232#endif
1233 1510
1234void si_meminfo(struct sysinfo *val) 1511void si_meminfo(struct sysinfo *val)
1235{ 1512{
@@ -1271,34 +1548,30 @@ void si_meminfo_node(struct sysinfo *val, int nid)
1271 */ 1548 */
1272void show_free_areas(void) 1549void show_free_areas(void)
1273{ 1550{
1274 int cpu, temperature; 1551 int cpu;
1275 unsigned long active; 1552 unsigned long active;
1276 unsigned long inactive; 1553 unsigned long inactive;
1277 unsigned long free; 1554 unsigned long free;
1278 struct zone *zone; 1555 struct zone *zone;
1279 1556
1280 for_each_zone(zone) { 1557 for_each_zone(zone) {
1281 show_node(zone); 1558 if (!populated_zone(zone))
1282 printk("%s per-cpu:", zone->name);
1283
1284 if (!populated_zone(zone)) {
1285 printk(" empty\n");
1286 continue; 1559 continue;
1287 } else 1560
1288 printk("\n"); 1561 show_node(zone);
1562 printk("%s per-cpu:\n", zone->name);
1289 1563
1290 for_each_online_cpu(cpu) { 1564 for_each_online_cpu(cpu) {
1291 struct per_cpu_pageset *pageset; 1565 struct per_cpu_pageset *pageset;
1292 1566
1293 pageset = zone_pcp(zone, cpu); 1567 pageset = zone_pcp(zone, cpu);
1294 1568
1295 for (temperature = 0; temperature < 2; temperature++) 1569 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1296 printk("cpu %d %s: high %d, batch %d used:%d\n", 1570 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1297 cpu, 1571 cpu, pageset->pcp[0].high,
1298 temperature ? "cold" : "hot", 1572 pageset->pcp[0].batch, pageset->pcp[0].count,
1299 pageset->pcp[temperature].high, 1573 pageset->pcp[1].high, pageset->pcp[1].batch,
1300 pageset->pcp[temperature].batch, 1574 pageset->pcp[1].count);
1301 pageset->pcp[temperature].count);
1302 } 1575 }
1303 } 1576 }
1304 1577
@@ -1320,6 +1593,9 @@ void show_free_areas(void)
1320 for_each_zone(zone) { 1593 for_each_zone(zone) {
1321 int i; 1594 int i;
1322 1595
1596 if (!populated_zone(zone))
1597 continue;
1598
1323 show_node(zone); 1599 show_node(zone);
1324 printk("%s" 1600 printk("%s"
1325 " free:%lukB" 1601 " free:%lukB"
@@ -1352,12 +1628,11 @@ void show_free_areas(void)
1352 for_each_zone(zone) { 1628 for_each_zone(zone) {
1353 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1629 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1354 1630
1631 if (!populated_zone(zone))
1632 continue;
1633
1355 show_node(zone); 1634 show_node(zone);
1356 printk("%s: ", zone->name); 1635 printk("%s: ", zone->name);
1357 if (!populated_zone(zone)) {
1358 printk("empty\n");
1359 continue;
1360 }
1361 1636
1362 spin_lock_irqsave(&zone->lock, flags); 1637 spin_lock_irqsave(&zone->lock, flags);
1363 for (order = 0; order < MAX_ORDER; order++) { 1638 for (order = 0; order < MAX_ORDER; order++) {
@@ -1510,6 +1785,24 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1510 } 1785 }
1511} 1786}
1512 1787
1788/* Construct the zonelist performance cache - see further mmzone.h */
1789static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1790{
1791 int i;
1792
1793 for (i = 0; i < MAX_NR_ZONES; i++) {
1794 struct zonelist *zonelist;
1795 struct zonelist_cache *zlc;
1796 struct zone **z;
1797
1798 zonelist = pgdat->node_zonelists + i;
1799 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1800 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1801 for (z = zonelist->zones; *z; z++)
1802 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1803 }
1804}
1805
1513#else /* CONFIG_NUMA */ 1806#else /* CONFIG_NUMA */
1514 1807
1515static void __meminit build_zonelists(pg_data_t *pgdat) 1808static void __meminit build_zonelists(pg_data_t *pgdat)
@@ -1547,21 +1840,33 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1547 } 1840 }
1548} 1841}
1549 1842
1843/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1844static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1845{
1846 int i;
1847
1848 for (i = 0; i < MAX_NR_ZONES; i++)
1849 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1850}
1851
1550#endif /* CONFIG_NUMA */ 1852#endif /* CONFIG_NUMA */
1551 1853
1552/* return values int ....just for stop_machine_run() */ 1854/* return values int ....just for stop_machine_run() */
1553static int __meminit __build_all_zonelists(void *dummy) 1855static int __meminit __build_all_zonelists(void *dummy)
1554{ 1856{
1555 int nid; 1857 int nid;
1556 for_each_online_node(nid) 1858
1859 for_each_online_node(nid) {
1557 build_zonelists(NODE_DATA(nid)); 1860 build_zonelists(NODE_DATA(nid));
1861 build_zonelist_cache(NODE_DATA(nid));
1862 }
1558 return 0; 1863 return 0;
1559} 1864}
1560 1865
1561void __meminit build_all_zonelists(void) 1866void __meminit build_all_zonelists(void)
1562{ 1867{
1563 if (system_state == SYSTEM_BOOTING) { 1868 if (system_state == SYSTEM_BOOTING) {
1564 __build_all_zonelists(0); 1869 __build_all_zonelists(NULL);
1565 cpuset_init_current_mems_allowed(); 1870 cpuset_init_current_mems_allowed();
1566 } else { 1871 } else {
1567 /* we have to stop all cpus to guaranntee there is no user 1872 /* we have to stop all cpus to guaranntee there is no user
@@ -1642,25 +1947,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
1642 1947
1643#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1948#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1644 1949
1645static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1646 unsigned long *zones_size, unsigned long *zholes_size)
1647{
1648 unsigned long realtotalpages, totalpages = 0;
1649 enum zone_type i;
1650
1651 for (i = 0; i < MAX_NR_ZONES; i++)
1652 totalpages += zones_size[i];
1653 pgdat->node_spanned_pages = totalpages;
1654
1655 realtotalpages = totalpages;
1656 if (zholes_size)
1657 for (i = 0; i < MAX_NR_ZONES; i++)
1658 realtotalpages -= zholes_size[i];
1659 pgdat->node_present_pages = realtotalpages;
1660 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
1661}
1662
1663
1664/* 1950/*
1665 * Initially all pages are reserved - free ones are freed 1951 * Initially all pages are reserved - free ones are freed
1666 * up by free_all_bootmem() once the early boot process is 1952 * up by free_all_bootmem() once the early boot process is
@@ -1676,6 +1962,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1676 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1962 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1677 if (!early_pfn_valid(pfn)) 1963 if (!early_pfn_valid(pfn))
1678 continue; 1964 continue;
1965 if (!early_pfn_in_nid(pfn, nid))
1966 continue;
1679 page = pfn_to_page(pfn); 1967 page = pfn_to_page(pfn);
1680 set_page_links(page, zone, nid, pfn); 1968 set_page_links(page, zone, nid, pfn);
1681 init_page_count(page); 1969 init_page_count(page);
@@ -1700,20 +1988,6 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1700 } 1988 }
1701} 1989}
1702 1990
1703#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
1704void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
1705 unsigned long pfn, unsigned long size)
1706{
1707 unsigned long snum = pfn_to_section_nr(pfn);
1708 unsigned long end = pfn_to_section_nr(pfn + size);
1709
1710 if (FLAGS_HAS_NODE)
1711 zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
1712 else
1713 for (; snum <= end; snum++)
1714 zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
1715}
1716
1717#ifndef __HAVE_ARCH_MEMMAP_INIT 1991#ifndef __HAVE_ARCH_MEMMAP_INIT
1718#define memmap_init(size, nid, zone, start_pfn) \ 1992#define memmap_init(size, nid, zone, start_pfn) \
1719 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1993 memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1818,6 +2092,9 @@ static int __cpuinit process_zones(int cpu)
1818 2092
1819 for_each_zone(zone) { 2093 for_each_zone(zone) {
1820 2094
2095 if (!populated_zone(zone))
2096 continue;
2097
1821 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2098 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1822 GFP_KERNEL, cpu_to_node(cpu)); 2099 GFP_KERNEL, cpu_to_node(cpu));
1823 if (!zone_pcp(zone, cpu)) 2100 if (!zone_pcp(zone, cpu))
@@ -1863,16 +2140,16 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
1863 int ret = NOTIFY_OK; 2140 int ret = NOTIFY_OK;
1864 2141
1865 switch (action) { 2142 switch (action) {
1866 case CPU_UP_PREPARE: 2143 case CPU_UP_PREPARE:
1867 if (process_zones(cpu)) 2144 if (process_zones(cpu))
1868 ret = NOTIFY_BAD; 2145 ret = NOTIFY_BAD;
1869 break; 2146 break;
1870 case CPU_UP_CANCELED: 2147 case CPU_UP_CANCELED:
1871 case CPU_DEAD: 2148 case CPU_DEAD:
1872 free_zone_pagesets(cpu); 2149 free_zone_pagesets(cpu);
1873 break; 2150 break;
1874 default: 2151 default:
1875 break; 2152 break;
1876 } 2153 }
1877 return ret; 2154 return ret;
1878} 2155}
@@ -1977,6 +2254,349 @@ __meminit int init_currently_empty_zone(struct zone *zone,
1977 return 0; 2254 return 0;
1978} 2255}
1979 2256
2257#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2258/*
2259 * Basic iterator support. Return the first range of PFNs for a node
2260 * Note: nid == MAX_NUMNODES returns first region regardless of node
2261 */
2262static int __init first_active_region_index_in_nid(int nid)
2263{
2264 int i;
2265
2266 for (i = 0; i < nr_nodemap_entries; i++)
2267 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2268 return i;
2269
2270 return -1;
2271}
2272
2273/*
2274 * Basic iterator support. Return the next active range of PFNs for a node
2275 * Note: nid == MAX_NUMNODES returns next region regardles of node
2276 */
2277static int __init next_active_region_index_in_nid(int index, int nid)
2278{
2279 for (index = index + 1; index < nr_nodemap_entries; index++)
2280 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2281 return index;
2282
2283 return -1;
2284}
2285
2286#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2287/*
2288 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2289 * Architectures may implement their own version but if add_active_range()
2290 * was used and there are no special requirements, this is a convenient
2291 * alternative
2292 */
2293int __init early_pfn_to_nid(unsigned long pfn)
2294{
2295 int i;
2296
2297 for (i = 0; i < nr_nodemap_entries; i++) {
2298 unsigned long start_pfn = early_node_map[i].start_pfn;
2299 unsigned long end_pfn = early_node_map[i].end_pfn;
2300
2301 if (start_pfn <= pfn && pfn < end_pfn)
2302 return early_node_map[i].nid;
2303 }
2304
2305 return 0;
2306}
2307#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2308
2309/* Basic iterator support to walk early_node_map[] */
2310#define for_each_active_range_index_in_nid(i, nid) \
2311 for (i = first_active_region_index_in_nid(nid); i != -1; \
2312 i = next_active_region_index_in_nid(i, nid))
2313
2314/**
2315 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2316 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2317 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2318 *
2319 * If an architecture guarantees that all ranges registered with
2320 * add_active_ranges() contain no holes and may be freed, this
2321 * this function may be used instead of calling free_bootmem() manually.
2322 */
2323void __init free_bootmem_with_active_regions(int nid,
2324 unsigned long max_low_pfn)
2325{
2326 int i;
2327
2328 for_each_active_range_index_in_nid(i, nid) {
2329 unsigned long size_pages = 0;
2330 unsigned long end_pfn = early_node_map[i].end_pfn;
2331
2332 if (early_node_map[i].start_pfn >= max_low_pfn)
2333 continue;
2334
2335 if (end_pfn > max_low_pfn)
2336 end_pfn = max_low_pfn;
2337
2338 size_pages = end_pfn - early_node_map[i].start_pfn;
2339 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2340 PFN_PHYS(early_node_map[i].start_pfn),
2341 size_pages << PAGE_SHIFT);
2342 }
2343}
2344
2345/**
2346 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2347 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2348 *
2349 * If an architecture guarantees that all ranges registered with
2350 * add_active_ranges() contain no holes and may be freed, this
2351 * function may be used instead of calling memory_present() manually.
2352 */
2353void __init sparse_memory_present_with_active_regions(int nid)
2354{
2355 int i;
2356
2357 for_each_active_range_index_in_nid(i, nid)
2358 memory_present(early_node_map[i].nid,
2359 early_node_map[i].start_pfn,
2360 early_node_map[i].end_pfn);
2361}
2362
2363/**
2364 * push_node_boundaries - Push node boundaries to at least the requested boundary
2365 * @nid: The nid of the node to push the boundary for
2366 * @start_pfn: The start pfn of the node
2367 * @end_pfn: The end pfn of the node
2368 *
2369 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
2370 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
2371 * be hotplugged even though no physical memory exists. This function allows
2372 * an arch to push out the node boundaries so mem_map is allocated that can
2373 * be used later.
2374 */
2375#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2376void __init push_node_boundaries(unsigned int nid,
2377 unsigned long start_pfn, unsigned long end_pfn)
2378{
2379 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2380 nid, start_pfn, end_pfn);
2381
2382 /* Initialise the boundary for this node if necessary */
2383 if (node_boundary_end_pfn[nid] == 0)
2384 node_boundary_start_pfn[nid] = -1UL;
2385
2386 /* Update the boundaries */
2387 if (node_boundary_start_pfn[nid] > start_pfn)
2388 node_boundary_start_pfn[nid] = start_pfn;
2389 if (node_boundary_end_pfn[nid] < end_pfn)
2390 node_boundary_end_pfn[nid] = end_pfn;
2391}
2392
2393/* If necessary, push the node boundary out for reserve hotadd */
2394static void __init account_node_boundary(unsigned int nid,
2395 unsigned long *start_pfn, unsigned long *end_pfn)
2396{
2397 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2398 nid, *start_pfn, *end_pfn);
2399
2400 /* Return if boundary information has not been provided */
2401 if (node_boundary_end_pfn[nid] == 0)
2402 return;
2403
2404 /* Check the boundaries and update if necessary */
2405 if (node_boundary_start_pfn[nid] < *start_pfn)
2406 *start_pfn = node_boundary_start_pfn[nid];
2407 if (node_boundary_end_pfn[nid] > *end_pfn)
2408 *end_pfn = node_boundary_end_pfn[nid];
2409}
2410#else
2411void __init push_node_boundaries(unsigned int nid,
2412 unsigned long start_pfn, unsigned long end_pfn) {}
2413
2414static void __init account_node_boundary(unsigned int nid,
2415 unsigned long *start_pfn, unsigned long *end_pfn) {}
2416#endif
2417
2418
2419/**
2420 * get_pfn_range_for_nid - Return the start and end page frames for a node
2421 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2422 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2423 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2424 *
2425 * It returns the start and end page frame of a node based on information
2426 * provided by an arch calling add_active_range(). If called for a node
2427 * with no available memory, a warning is printed and the start and end
2428 * PFNs will be 0.
2429 */
2430void __init get_pfn_range_for_nid(unsigned int nid,
2431 unsigned long *start_pfn, unsigned long *end_pfn)
2432{
2433 int i;
2434 *start_pfn = -1UL;
2435 *end_pfn = 0;
2436
2437 for_each_active_range_index_in_nid(i, nid) {
2438 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2439 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2440 }
2441
2442 if (*start_pfn == -1UL) {
2443 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2444 *start_pfn = 0;
2445 }
2446
2447 /* Push the node boundaries out if requested */
2448 account_node_boundary(nid, start_pfn, end_pfn);
2449}
2450
2451/*
2452 * Return the number of pages a zone spans in a node, including holes
2453 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2454 */
2455unsigned long __init zone_spanned_pages_in_node(int nid,
2456 unsigned long zone_type,
2457 unsigned long *ignored)
2458{
2459 unsigned long node_start_pfn, node_end_pfn;
2460 unsigned long zone_start_pfn, zone_end_pfn;
2461
2462 /* Get the start and end of the node and zone */
2463 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2464 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2465 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2466
2467 /* Check that this node has pages within the zone's required range */
2468 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2469 return 0;
2470
2471 /* Move the zone boundaries inside the node if necessary */
2472 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2473 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2474
2475 /* Return the spanned pages */
2476 return zone_end_pfn - zone_start_pfn;
2477}
2478
2479/*
2480 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2481 * then all holes in the requested range will be accounted for.
2482 */
2483unsigned long __init __absent_pages_in_range(int nid,
2484 unsigned long range_start_pfn,
2485 unsigned long range_end_pfn)
2486{
2487 int i = 0;
2488 unsigned long prev_end_pfn = 0, hole_pages = 0;
2489 unsigned long start_pfn;
2490
2491 /* Find the end_pfn of the first active range of pfns in the node */
2492 i = first_active_region_index_in_nid(nid);
2493 if (i == -1)
2494 return 0;
2495
2496 /* Account for ranges before physical memory on this node */
2497 if (early_node_map[i].start_pfn > range_start_pfn)
2498 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2499
2500 prev_end_pfn = early_node_map[i].start_pfn;
2501
2502 /* Find all holes for the zone within the node */
2503 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2504
2505 /* No need to continue if prev_end_pfn is outside the zone */
2506 if (prev_end_pfn >= range_end_pfn)
2507 break;
2508
2509 /* Make sure the end of the zone is not within the hole */
2510 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2511 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2512
2513 /* Update the hole size cound and move on */
2514 if (start_pfn > range_start_pfn) {
2515 BUG_ON(prev_end_pfn > start_pfn);
2516 hole_pages += start_pfn - prev_end_pfn;
2517 }
2518 prev_end_pfn = early_node_map[i].end_pfn;
2519 }
2520
2521 /* Account for ranges past physical memory on this node */
2522 if (range_end_pfn > prev_end_pfn)
2523 hole_pages += range_end_pfn -
2524 max(range_start_pfn, prev_end_pfn);
2525
2526 return hole_pages;
2527}
2528
2529/**
2530 * absent_pages_in_range - Return number of page frames in holes within a range
2531 * @start_pfn: The start PFN to start searching for holes
2532 * @end_pfn: The end PFN to stop searching for holes
2533 *
2534 * It returns the number of pages frames in memory holes within a range.
2535 */
2536unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2537 unsigned long end_pfn)
2538{
2539 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2540}
2541
2542/* Return the number of page frames in holes in a zone on a node */
2543unsigned long __init zone_absent_pages_in_node(int nid,
2544 unsigned long zone_type,
2545 unsigned long *ignored)
2546{
2547 unsigned long node_start_pfn, node_end_pfn;
2548 unsigned long zone_start_pfn, zone_end_pfn;
2549
2550 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2551 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2552 node_start_pfn);
2553 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2554 node_end_pfn);
2555
2556 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2557}
2558
2559#else
2560static inline unsigned long zone_spanned_pages_in_node(int nid,
2561 unsigned long zone_type,
2562 unsigned long *zones_size)
2563{
2564 return zones_size[zone_type];
2565}
2566
2567static inline unsigned long zone_absent_pages_in_node(int nid,
2568 unsigned long zone_type,
2569 unsigned long *zholes_size)
2570{
2571 if (!zholes_size)
2572 return 0;
2573
2574 return zholes_size[zone_type];
2575}
2576
2577#endif
2578
2579static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2580 unsigned long *zones_size, unsigned long *zholes_size)
2581{
2582 unsigned long realtotalpages, totalpages = 0;
2583 enum zone_type i;
2584
2585 for (i = 0; i < MAX_NR_ZONES; i++)
2586 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2587 zones_size);
2588 pgdat->node_spanned_pages = totalpages;
2589
2590 realtotalpages = totalpages;
2591 for (i = 0; i < MAX_NR_ZONES; i++)
2592 realtotalpages -=
2593 zone_absent_pages_in_node(pgdat->node_id, i,
2594 zholes_size);
2595 pgdat->node_present_pages = realtotalpages;
2596 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2597 realtotalpages);
2598}
2599
1980/* 2600/*
1981 * Set up the zone data structures: 2601 * Set up the zone data structures:
1982 * - mark all pages reserved 2602 * - mark all pages reserved
@@ -1998,11 +2618,34 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
1998 2618
1999 for (j = 0; j < MAX_NR_ZONES; j++) { 2619 for (j = 0; j < MAX_NR_ZONES; j++) {
2000 struct zone *zone = pgdat->node_zones + j; 2620 struct zone *zone = pgdat->node_zones + j;
2001 unsigned long size, realsize; 2621 unsigned long size, realsize, memmap_pages;
2002 2622
2003 realsize = size = zones_size[j]; 2623 size = zone_spanned_pages_in_node(nid, j, zones_size);
2004 if (zholes_size) 2624 realsize = size - zone_absent_pages_in_node(nid, j,
2005 realsize -= zholes_size[j]; 2625 zholes_size);
2626
2627 /*
2628 * Adjust realsize so that it accounts for how much memory
2629 * is used by this zone for memmap. This affects the watermark
2630 * and per-cpu initialisations
2631 */
2632 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2633 if (realsize >= memmap_pages) {
2634 realsize -= memmap_pages;
2635 printk(KERN_DEBUG
2636 " %s zone: %lu pages used for memmap\n",
2637 zone_names[j], memmap_pages);
2638 } else
2639 printk(KERN_WARNING
2640 " %s zone: %lu pages exceeds realsize %lu\n",
2641 zone_names[j], memmap_pages, realsize);
2642
2643 /* Account for reserved DMA pages */
2644 if (j == ZONE_DMA && realsize > dma_reserve) {
2645 realsize -= dma_reserve;
2646 printk(KERN_DEBUG " DMA zone: %lu pages reserved\n",
2647 dma_reserve);
2648 }
2006 2649
2007 if (!is_highmem_idx(j)) 2650 if (!is_highmem_idx(j))
2008 nr_kernel_pages += realsize; 2651 nr_kernel_pages += realsize;
@@ -2011,6 +2654,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2011 zone->spanned_pages = size; 2654 zone->spanned_pages = size;
2012 zone->present_pages = realsize; 2655 zone->present_pages = realsize;
2013#ifdef CONFIG_NUMA 2656#ifdef CONFIG_NUMA
2657 zone->node = nid;
2014 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2658 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2015 / 100; 2659 / 100;
2016 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2660 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
@@ -2022,7 +2666,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2022 zone->zone_pgdat = pgdat; 2666 zone->zone_pgdat = pgdat;
2023 zone->free_pages = 0; 2667 zone->free_pages = 0;
2024 2668
2025 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 2669 zone->prev_priority = DEF_PRIORITY;
2026 2670
2027 zone_pcp_init(zone); 2671 zone_pcp_init(zone);
2028 INIT_LIST_HEAD(&zone->active_list); 2672 INIT_LIST_HEAD(&zone->active_list);
@@ -2036,7 +2680,6 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2036 if (!size) 2680 if (!size)
2037 continue; 2681 continue;
2038 2682
2039 zonetable_add(zone, nid, j, zone_start_pfn, size);
2040 ret = init_currently_empty_zone(zone, zone_start_pfn, size); 2683 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2041 BUG_ON(ret); 2684 BUG_ON(ret);
2042 zone_start_pfn += size; 2685 zone_start_pfn += size;
@@ -2073,8 +2716,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2073 /* 2716 /*
2074 * With no DISCONTIG, the global mem_map is just set as node 0's 2717 * With no DISCONTIG, the global mem_map is just set as node 0's
2075 */ 2718 */
2076 if (pgdat == NODE_DATA(0)) 2719 if (pgdat == NODE_DATA(0)) {
2077 mem_map = NODE_DATA(0)->node_mem_map; 2720 mem_map = NODE_DATA(0)->node_mem_map;
2721#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2722 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2723 mem_map -= pgdat->node_start_pfn;
2724#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2725 }
2078#endif 2726#endif
2079#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2727#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2080} 2728}
@@ -2085,13 +2733,254 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2085{ 2733{
2086 pgdat->node_id = nid; 2734 pgdat->node_id = nid;
2087 pgdat->node_start_pfn = node_start_pfn; 2735 pgdat->node_start_pfn = node_start_pfn;
2088 calculate_zone_totalpages(pgdat, zones_size, zholes_size); 2736 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2089 2737
2090 alloc_node_mem_map(pgdat); 2738 alloc_node_mem_map(pgdat);
2091 2739
2092 free_area_init_core(pgdat, zones_size, zholes_size); 2740 free_area_init_core(pgdat, zones_size, zholes_size);
2093} 2741}
2094 2742
2743#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2744/**
2745 * add_active_range - Register a range of PFNs backed by physical memory
2746 * @nid: The node ID the range resides on
2747 * @start_pfn: The start PFN of the available physical memory
2748 * @end_pfn: The end PFN of the available physical memory
2749 *
2750 * These ranges are stored in an early_node_map[] and later used by
2751 * free_area_init_nodes() to calculate zone sizes and holes. If the
2752 * range spans a memory hole, it is up to the architecture to ensure
2753 * the memory is not freed by the bootmem allocator. If possible
2754 * the range being registered will be merged with existing ranges.
2755 */
2756void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2757 unsigned long end_pfn)
2758{
2759 int i;
2760
2761 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2762 "%d entries of %d used\n",
2763 nid, start_pfn, end_pfn,
2764 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2765
2766 /* Merge with existing active regions if possible */
2767 for (i = 0; i < nr_nodemap_entries; i++) {
2768 if (early_node_map[i].nid != nid)
2769 continue;
2770
2771 /* Skip if an existing region covers this new one */
2772 if (start_pfn >= early_node_map[i].start_pfn &&
2773 end_pfn <= early_node_map[i].end_pfn)
2774 return;
2775
2776 /* Merge forward if suitable */
2777 if (start_pfn <= early_node_map[i].end_pfn &&
2778 end_pfn > early_node_map[i].end_pfn) {
2779 early_node_map[i].end_pfn = end_pfn;
2780 return;
2781 }
2782
2783 /* Merge backward if suitable */
2784 if (start_pfn < early_node_map[i].end_pfn &&
2785 end_pfn >= early_node_map[i].start_pfn) {
2786 early_node_map[i].start_pfn = start_pfn;
2787 return;
2788 }
2789 }
2790
2791 /* Check that early_node_map is large enough */
2792 if (i >= MAX_ACTIVE_REGIONS) {
2793 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2794 MAX_ACTIVE_REGIONS);
2795 return;
2796 }
2797
2798 early_node_map[i].nid = nid;
2799 early_node_map[i].start_pfn = start_pfn;
2800 early_node_map[i].end_pfn = end_pfn;
2801 nr_nodemap_entries = i + 1;
2802}
2803
2804/**
2805 * shrink_active_range - Shrink an existing registered range of PFNs
2806 * @nid: The node id the range is on that should be shrunk
2807 * @old_end_pfn: The old end PFN of the range
2808 * @new_end_pfn: The new PFN of the range
2809 *
2810 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
2811 * The map is kept at the end physical page range that has already been
2812 * registered with add_active_range(). This function allows an arch to shrink
2813 * an existing registered range.
2814 */
2815void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2816 unsigned long new_end_pfn)
2817{
2818 int i;
2819
2820 /* Find the old active region end and shrink */
2821 for_each_active_range_index_in_nid(i, nid)
2822 if (early_node_map[i].end_pfn == old_end_pfn) {
2823 early_node_map[i].end_pfn = new_end_pfn;
2824 break;
2825 }
2826}
2827
2828/**
2829 * remove_all_active_ranges - Remove all currently registered regions
2830 *
2831 * During discovery, it may be found that a table like SRAT is invalid
2832 * and an alternative discovery method must be used. This function removes
2833 * all currently registered regions.
2834 */
2835void __init remove_all_active_ranges(void)
2836{
2837 memset(early_node_map, 0, sizeof(early_node_map));
2838 nr_nodemap_entries = 0;
2839#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2840 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2841 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2842#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2843}
2844
2845/* Compare two active node_active_regions */
2846static int __init cmp_node_active_region(const void *a, const void *b)
2847{
2848 struct node_active_region *arange = (struct node_active_region *)a;
2849 struct node_active_region *brange = (struct node_active_region *)b;
2850
2851 /* Done this way to avoid overflows */
2852 if (arange->start_pfn > brange->start_pfn)
2853 return 1;
2854 if (arange->start_pfn < brange->start_pfn)
2855 return -1;
2856
2857 return 0;
2858}
2859
2860/* sort the node_map by start_pfn */
2861static void __init sort_node_map(void)
2862{
2863 sort(early_node_map, (size_t)nr_nodemap_entries,
2864 sizeof(struct node_active_region),
2865 cmp_node_active_region, NULL);
2866}
2867
2868/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
2869unsigned long __init find_min_pfn_for_node(unsigned long nid)
2870{
2871 int i;
2872
2873 /* Regions in the early_node_map can be in any order */
2874 sort_node_map();
2875
2876 /* Assuming a sorted map, the first range found has the starting pfn */
2877 for_each_active_range_index_in_nid(i, nid)
2878 return early_node_map[i].start_pfn;
2879
2880 printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
2881 return 0;
2882}
2883
2884/**
2885 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2886 *
2887 * It returns the minimum PFN based on information provided via
2888 * add_active_range().
2889 */
2890unsigned long __init find_min_pfn_with_active_regions(void)
2891{
2892 return find_min_pfn_for_node(MAX_NUMNODES);
2893}
2894
2895/**
2896 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2897 *
2898 * It returns the maximum PFN based on information provided via
2899 * add_active_range().
2900 */
2901unsigned long __init find_max_pfn_with_active_regions(void)
2902{
2903 int i;
2904 unsigned long max_pfn = 0;
2905
2906 for (i = 0; i < nr_nodemap_entries; i++)
2907 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2908
2909 return max_pfn;
2910}
2911
2912/**
2913 * free_area_init_nodes - Initialise all pg_data_t and zone data
2914 * @max_zone_pfn: an array of max PFNs for each zone
2915 *
2916 * This will call free_area_init_node() for each active node in the system.
2917 * Using the page ranges provided by add_active_range(), the size of each
2918 * zone in each node and their holes is calculated. If the maximum PFN
2919 * between two adjacent zones match, it is assumed that the zone is empty.
2920 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2921 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2922 * starts where the previous one ended. For example, ZONE_DMA32 starts
2923 * at arch_max_dma_pfn.
2924 */
2925void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2926{
2927 unsigned long nid;
2928 enum zone_type i;
2929
2930 /* Record where the zone boundaries are */
2931 memset(arch_zone_lowest_possible_pfn, 0,
2932 sizeof(arch_zone_lowest_possible_pfn));
2933 memset(arch_zone_highest_possible_pfn, 0,
2934 sizeof(arch_zone_highest_possible_pfn));
2935 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2936 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2937 for (i = 1; i < MAX_NR_ZONES; i++) {
2938 arch_zone_lowest_possible_pfn[i] =
2939 arch_zone_highest_possible_pfn[i-1];
2940 arch_zone_highest_possible_pfn[i] =
2941 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2942 }
2943
2944 /* Print out the zone ranges */
2945 printk("Zone PFN ranges:\n");
2946 for (i = 0; i < MAX_NR_ZONES; i++)
2947 printk(" %-8s %8lu -> %8lu\n",
2948 zone_names[i],
2949 arch_zone_lowest_possible_pfn[i],
2950 arch_zone_highest_possible_pfn[i]);
2951
2952 /* Print out the early_node_map[] */
2953 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2954 for (i = 0; i < nr_nodemap_entries; i++)
2955 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2956 early_node_map[i].start_pfn,
2957 early_node_map[i].end_pfn);
2958
2959 /* Initialise every node */
2960 for_each_online_node(nid) {
2961 pg_data_t *pgdat = NODE_DATA(nid);
2962 free_area_init_node(nid, pgdat, NULL,
2963 find_min_pfn_for_node(nid), NULL);
2964 }
2965}
2966#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2967
2968/**
2969 * set_dma_reserve - set the specified number of pages reserved in the first zone
2970 * @new_dma_reserve: The number of pages to mark reserved
2971 *
2972 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2973 * In the DMA zone, a significant percentage may be consumed by kernel image
2974 * and other unfreeable allocations which can skew the watermarks badly. This
2975 * function may optionally be used to account for unfreeable pages in the
2976 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2977 * smaller per-cpu batchsize.
2978 */
2979void __init set_dma_reserve(unsigned long new_dma_reserve)
2980{
2981 dma_reserve = new_dma_reserve;
2982}
2983
2095#ifndef CONFIG_NEED_MULTIPLE_NODES 2984#ifndef CONFIG_NEED_MULTIPLE_NODES
2096static bootmem_data_t contig_bootmem_data; 2985static bootmem_data_t contig_bootmem_data;
2097struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2986struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
@@ -2105,7 +2994,6 @@ void __init free_area_init(unsigned long *zones_size)
2105 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 2994 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
2106} 2995}
2107 2996
2108#ifdef CONFIG_HOTPLUG_CPU
2109static int page_alloc_cpu_notify(struct notifier_block *self, 2997static int page_alloc_cpu_notify(struct notifier_block *self,
2110 unsigned long action, void *hcpu) 2998 unsigned long action, void *hcpu)
2111{ 2999{
@@ -2120,7 +3008,6 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
2120 } 3008 }
2121 return NOTIFY_OK; 3009 return NOTIFY_OK;
2122} 3010}
2123#endif /* CONFIG_HOTPLUG_CPU */
2124 3011
2125void __init page_alloc_init(void) 3012void __init page_alloc_init(void)
2126{ 3013{
@@ -2198,10 +3085,11 @@ static void setup_per_zone_lowmem_reserve(void)
2198 calculate_totalreserve_pages(); 3085 calculate_totalreserve_pages();
2199} 3086}
2200 3087
2201/* 3088/**
2202 * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures 3089 * setup_per_zone_pages_min - called when min_free_kbytes changes.
2203 * that the pages_{min,low,high} values for each zone are set correctly 3090 *
2204 * with respect to min_free_kbytes. 3091 * Ensures that the pages_{min,low,high} values for each zone are set correctly
3092 * with respect to min_free_kbytes.
2205 */ 3093 */
2206void setup_per_zone_pages_min(void) 3094void setup_per_zone_pages_min(void)
2207{ 3095{
@@ -2423,7 +3311,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2423 /* allow the kernel cmdline to have a say */ 3311 /* allow the kernel cmdline to have a say */
2424 if (!numentries) { 3312 if (!numentries) {
2425 /* round applicable memory size up to nearest megabyte */ 3313 /* round applicable memory size up to nearest megabyte */
2426 numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages; 3314 numentries = nr_kernel_pages;
2427 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3315 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
2428 numentries >>= 20 - PAGE_SHIFT; 3316 numentries >>= 20 - PAGE_SHIFT;
2429 numentries <<= 20 - PAGE_SHIFT; 3317 numentries <<= 20 - PAGE_SHIFT;
@@ -2445,7 +3333,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2445 if (numentries > max) 3333 if (numentries > max)
2446 numentries = max; 3334 numentries = max;
2447 3335
2448 log2qty = long_log2(numentries); 3336 log2qty = ilog2(numentries);
2449 3337
2450 do { 3338 do {
2451 size = bucketsize << log2qty; 3339 size = bucketsize << log2qty;
@@ -2467,7 +3355,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2467 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3355 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
2468 tablename, 3356 tablename,
2469 (1U << log2qty), 3357 (1U << log2qty),
2470 long_log2(size) - PAGE_SHIFT, 3358 ilog2(size) - PAGE_SHIFT,
2471 size); 3359 size);
2472 3360
2473 if (_hash_shift) 3361 if (_hash_shift)
@@ -2490,3 +3378,19 @@ unsigned long page_to_pfn(struct page *page)
2490EXPORT_SYMBOL(pfn_to_page); 3378EXPORT_SYMBOL(pfn_to_page);
2491EXPORT_SYMBOL(page_to_pfn); 3379EXPORT_SYMBOL(page_to_pfn);
2492#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3380#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3381
3382#if MAX_NUMNODES > 1
3383/*
3384 * Find the highest possible node id.
3385 */
3386int highest_possible_node_id(void)
3387{
3388 unsigned int node;
3389 unsigned int highest = 0;
3390
3391 for_each_node_mask(node, node_possible_map)
3392 highest = node;
3393 return highest;
3394}
3395EXPORT_SYMBOL(highest_possible_node_id);
3396#endif