author     Mel Gorman <mel@csn.ul.ie>              2006-09-27 04:49:43 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>   2006-09-27 11:26:11 -0400
commit     c713216deebd95d2b0ab38fef8bb2361c0180c2d (patch)
tree       a5a8c61be427e3591811ff712b9ec7ef2f1a1f20
parent     2bd0cfbde2c0a74e209acbf045f1298cc2f61e01 (diff)
[PATCH] Introduce mechanism for registering active regions of memory
At a basic level, architectures define structures to record where active
ranges of page frames are located.  Once located, the code to calculate zone
sizes and holes in each architecture is very similar.  Some of this zone and
hole sizing code is difficult to read for no good reason.  This set of patches
eliminates the similar-looking architecture-specific code.

The patches introduce a mechanism where architectures register where the
active ranges of page frames are with add_active_range().  When all areas
have been discovered, free_area_init_nodes() is called to initialise the
pgdat and zones.  The zone sizes and holes are then calculated in an
architecture independent manner.

Patch 1 introduces the mechanism for registering and initialising PFN ranges
Patch 2 changes ppc to use the mechanism    - 139 arch-specific LOC removed
Patch 3 changes x86 to use the mechanism    - 136 arch-specific LOC removed
Patch 4 changes x86_64 to use the mechanism -  74 arch-specific LOC removed
Patch 5 changes ia64 to use the mechanism   -  52 arch-specific LOC removed
Patch 6 accounts for mem_map as a memory hole as the pages are not
        reclaimable.  It adjusts the watermarks slightly

Tony Luck has successfully tested for ia64 on Itanium with tiger_defconfig,
gensparse_defconfig and defconfig.  Bob Picco has also tested and debugged on
IA64.  Jack Steiner successfully boot tested on a mammoth SGI IA64-based
machine.  These were on patches against 2.6.17-rc1 and release 3 of these
patches but there have been no ia64-changes since release 3.

There are differences in the zone sizes for x86_64 as the arch-specific code
for x86_64 accounts the kernel image and the starting mem_maps as memory
holes but the architecture-independent code accounts the memory as present.

The big benefit of this set of patches is a sizable reduction of
architecture-specific code, some of which is very hairy.  There should be a
greater reduction when other architectures use the same mechanisms for zone
and hole sizing but I lack the hardware to test on.

Additional credit:
	Dave Hansen for the initial suggestion and comments on early patches
	Andy Whitcroft for reviewing early versions and catching numerous
		errors
	Tony Luck for testing and debugging on IA64
	Bob Picco for fixing bugs related to pfn registration, reviewing a
		number of patch revisions, providing a number of suggestions
		on future direction and testing heavily
	Jack Steiner and Robin Holt for testing on IA64 and clarifying
		issues related to memory holes
	Yasunori for testing on IA64
	Andi Kleen for reviewing and feeding back about x86_64
	Christian Kujau for providing valuable information related to ACPI
		problems on x86_64 and testing potential fixes

This patch:

Define the structure to represent an active range of page frames within a
node in an architecture independent manner.  Architectures are expected to
register active ranges of PFNs using add_active_range(nid, start_pfn,
end_pfn) and call free_area_init_nodes() passing the PFNs of the end of each
zone.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Andy Whitcroft <apw@shadowen.org>
Cc: Andi Kleen <ak@muc.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "Keith Mannthey" <kmannth@gmail.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
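[Editor's illustration] The sketch below shows roughly what an architecture's early setup code looks like once it adopts the interface described above.  It is a minimal sketch only: example_arch_zone_init(), for_each_ram_range() and the max_*_pfn variables are hypothetical placeholders standing in for whatever iterator and PFN limits the architecture already uses to walk its physical memory map; only add_active_range() and free_area_init_nodes() are the interfaces introduced by this series.

/*
 * Hypothetical example, not part of this patch: for_each_ram_range() and
 * the max_*_pfn variables stand in for the architecture's own memory-map
 * iterator and zone-boundary PFNs.
 */
static void __init example_arch_zone_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };
        unsigned long start_pfn, end_pfn;
        int nid;

        /* Highest PFN each zone may contain; equal adjacent limits mean an empty zone */
        max_zone_pfns[ZONE_DMA]     = max_dma_pfn;
        max_zone_pfns[ZONE_NORMAL]  = max_low_pfn;
        max_zone_pfns[ZONE_HIGHMEM] = max_pfn;

        /* Register every range of present physical memory, per node */
        for_each_ram_range(nid, start_pfn, end_pfn)
                add_active_range(nid, start_pfn, end_pfn);

        /* Core mm now sizes each node's zones and accounts for holes */
        free_area_init_nodes(max_zone_pfns);
}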
-rw-r--r--   include/linux/mm.h        47
-rw-r--r--   include/linux/mmzone.h    10
-rw-r--r--   mm/page_alloc.c          552
3 files changed, 584 insertions, 25 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 856f0ee7e84a..c0402da7cce0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -937,6 +937,53 @@ extern void free_area_init(unsigned long * zones_size);
 extern void free_area_init_node(int nid, pg_data_t *pgdat,
         unsigned long * zones_size, unsigned long zone_start_pfn,
         unsigned long *zholes_size);
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in a more
+ * architecture independent manner. This is a substitute for creating the
+ * zone_sizes[] and zholes_size[] arrays and passing them to
+ * free_area_init_node()
+ *
+ * An architecture is expected to register ranges of page frames backed by
+ * physical memory with add_active_range() before calling
+ * free_area_init_nodes() passing in the PFN each zone ends at. For basic
+ * usage, an architecture is expected to do something like
+ *
+ * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
+ *                                              max_highmem_pfn};
+ * for_each_valid_physical_page_range()
+ *         add_active_range(node_id, start_pfn, end_pfn)
+ * free_area_init_nodes(max_zone_pfns);
+ *
+ * If the architecture guarantees that there are no holes in the ranges
+ * registered with add_active_range(), free_bootmem_with_active_regions()
+ * will call free_bootmem_node() for each registered physical page range.
+ * Similarly sparse_memory_present_with_active_regions() calls
+ * memory_present() for each range when SPARSEMEM is enabled.
+ *
+ * See mm/page_alloc.c for more information on each function exposed by
+ * CONFIG_ARCH_POPULATES_NODE_MAP
+ */
+extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+extern void add_active_range(unsigned int nid, unsigned long start_pfn,
+                                        unsigned long end_pfn);
+extern void shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+                                        unsigned long new_end_pfn);
+extern void remove_all_active_ranges(void);
+extern unsigned long absent_pages_in_range(unsigned long start_pfn,
+                                        unsigned long end_pfn);
+extern void get_pfn_range_for_nid(unsigned int nid,
+                        unsigned long *start_pfn, unsigned long *end_pfn);
+extern unsigned long find_min_pfn_with_active_regions(void);
+extern unsigned long find_max_pfn_with_active_regions(void);
+extern void free_bootmem_with_active_regions(int nid,
+                                        unsigned long max_low_pfn);
+extern void sparse_memory_present_with_active_regions(int nid);
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+extern int early_pfn_to_nid(unsigned long pfn);
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long);
 extern void setup_per_zone_pages_min(void);
 extern void mem_init(void);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3693f1a52788..7fa1cbe9fa7a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -305,6 +305,13 @@ struct zonelist {
         struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
 };
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+struct node_active_region {
+        unsigned long start_pfn;
+        unsigned long end_pfn;
+        int nid;
+};
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /*
  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
@@ -518,7 +525,8 @@ extern struct zone *next_zone(struct zone *zone);
 
 #endif
 
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
+        !defined(CONFIG_ARCH_POPULATES_NODE_MAP)
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9810f0a60db7..26c9939857fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,8 @@
 #include <linux/vmalloc.h>
 #include <linux/mempolicy.h>
 #include <linux/stop_machine.h>
+#include <linux/sort.h>
+#include <linux/pfn.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -103,6 +105,33 @@ int min_free_kbytes = 1024;
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+  /*
+   * MAX_ACTIVE_REGIONS determines the maximum number of distinct
+   * ranges of memory (RAM) that may be registered with add_active_range().
+   * Ranges passed to add_active_range() will be merged if possible
+   * so the number of times add_active_range() can be called is
+   * related to the number of nodes and the number of holes
+   */
+  #ifdef CONFIG_MAX_ACTIVE_REGIONS
+    /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
+    #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
+  #else
+    #if MAX_NUMNODES >= 32
+      /* If there can be many nodes, allow up to 50 holes per node */
+      #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
+    #else
+      /* By default, allow up to 256 distinct regions */
+      #define MAX_ACTIVE_REGIONS 256
+    #endif
+  #endif
+
+  struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
+  int __initdata nr_nodemap_entries;
+  unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
@@ -1642,25 +1671,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
 
 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
 
-static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
-                unsigned long *zones_size, unsigned long *zholes_size)
-{
-        unsigned long realtotalpages, totalpages = 0;
-        enum zone_type i;
-
-        for (i = 0; i < MAX_NR_ZONES; i++)
-                totalpages += zones_size[i];
-        pgdat->node_spanned_pages = totalpages;
-
-        realtotalpages = totalpages;
-        if (zholes_size)
-                for (i = 0; i < MAX_NR_ZONES; i++)
-                        realtotalpages -= zholes_size[i];
-        pgdat->node_present_pages = realtotalpages;
-        printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
-}
-
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -1977,6 +1987,272 @@ __meminit int init_currently_empty_zone(struct zone *zone,
         return 0;
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/*
+ * Basic iterator support. Return the first range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns first region regardless of node
+ */
+static int __init first_active_region_index_in_nid(int nid)
+{
+        int i;
+
+        for (i = 0; i < nr_nodemap_entries; i++)
+                if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+                        return i;
+
+        return -1;
+}
+
+/*
+ * Basic iterator support. Return the next active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __init next_active_region_index_in_nid(int index, int nid)
+{
+        for (index = index + 1; index < nr_nodemap_entries; index++)
+                if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+                        return index;
+
+        return -1;
+}
+
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ * Architectures may implement their own version but if add_active_range()
+ * was used and there are no special requirements, this is a convenient
+ * alternative
+ */
+int __init early_pfn_to_nid(unsigned long pfn)
+{
+        int i;
+
+        for (i = 0; i < nr_nodemap_entries; i++) {
+                unsigned long start_pfn = early_node_map[i].start_pfn;
+                unsigned long end_pfn = early_node_map[i].end_pfn;
+
+                if (start_pfn <= pfn && pfn < end_pfn)
+                        return early_node_map[i].nid;
+        }
+
+        return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
+/* Basic iterator support to walk early_node_map[] */
+#define for_each_active_range_index_in_nid(i, nid) \
+        for (i = first_active_region_index_in_nid(nid); i != -1; \
+                i = next_active_region_index_in_nid(i, nid))
+
+/**
+ * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this function may
+ * be used instead of calling free_bootmem() manually.
+ */
+void __init free_bootmem_with_active_regions(int nid,
+                                                unsigned long max_low_pfn)
+{
+        int i;
+
+        for_each_active_range_index_in_nid(i, nid) {
+                unsigned long size_pages = 0;
+                unsigned long end_pfn = early_node_map[i].end_pfn;
+
+                if (early_node_map[i].start_pfn >= max_low_pfn)
+                        continue;
+
+                if (end_pfn > max_low_pfn)
+                        end_pfn = max_low_pfn;
+
+                size_pages = end_pfn - early_node_map[i].start_pfn;
+                free_bootmem_node(NODE_DATA(early_node_map[i].nid),
+                                PFN_PHYS(early_node_map[i].start_pfn),
+                                size_pages << PAGE_SHIFT);
+        }
+}
+
+/**
+ * sparse_memory_present_with_active_regions - Call memory_present for each active range
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ *
+ * If an architecture guarantees that all ranges registered with
+ * add_active_range() contain no holes and may be freed, this function may
+ * be used instead of calling memory_present() manually.
+ */
+void __init sparse_memory_present_with_active_regions(int nid)
+{
+        int i;
+
+        for_each_active_range_index_in_nid(i, nid)
+                memory_present(early_node_map[i].nid,
+                                early_node_map[i].start_pfn,
+                                early_node_map[i].end_pfn);
+}
+
+/**
+ * get_pfn_range_for_nid - Return the start and end page frames for a node
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ *
+ * It returns the start and end page frame of a node based on information
+ * provided by an arch calling add_active_range(). If called for a node
+ * with no available memory, a warning is printed and the start and end
+ * PFNs will be 0
+ */
+void __init get_pfn_range_for_nid(unsigned int nid,
+                        unsigned long *start_pfn, unsigned long *end_pfn)
+{
+        int i;
+        *start_pfn = -1UL;
+        *end_pfn = 0;
+
+        for_each_active_range_index_in_nid(i, nid) {
+                *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
+                *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
+        }
+
+        if (*start_pfn == -1UL) {
+                printk(KERN_WARNING "Node %u active with no memory\n", nid);
+                *start_pfn = 0;
+        }
+}
+
+/*
+ * Return the number of pages a zone spans in a node, including holes
+ * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
+ */
+unsigned long __init zone_spanned_pages_in_node(int nid,
+                                        unsigned long zone_type,
+                                        unsigned long *ignored)
+{
+        unsigned long node_start_pfn, node_end_pfn;
+        unsigned long zone_start_pfn, zone_end_pfn;
+
+        /* Get the start and end of the node and zone */
+        get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+        zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+        zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+
+        /* Check that this node has pages within the zone's required range */
+        if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+                return 0;
+
+        /* Move the zone boundaries inside the node if necessary */
+        zone_end_pfn = min(zone_end_pfn, node_end_pfn);
+        zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+
+        /* Return the spanned pages */
+        return zone_end_pfn - zone_start_pfn;
+}
+
+/*
+ * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
+ * then all holes in the requested range will be accounted for
+ */
+unsigned long __init __absent_pages_in_range(int nid,
+                                unsigned long range_start_pfn,
+                                unsigned long range_end_pfn)
+{
+        int i = 0;
+        unsigned long prev_end_pfn = 0, hole_pages = 0;
+        unsigned long start_pfn;
+
+        /* Find the end_pfn of the first active range of pfns in the node */
+        i = first_active_region_index_in_nid(nid);
+        if (i == -1)
+                return 0;
+
+        prev_end_pfn = early_node_map[i].start_pfn;
+
+        /* Find all holes for the zone within the node */
+        for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
+
+                /* No need to continue if prev_end_pfn is outside the zone */
+                if (prev_end_pfn >= range_end_pfn)
+                        break;
+
+                /* Make sure the end of the zone is not within the hole */
+                start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
+                prev_end_pfn = max(prev_end_pfn, range_start_pfn);
+
+                /* Update the hole size count and move on */
+                if (start_pfn > range_start_pfn) {
+                        BUG_ON(prev_end_pfn > start_pfn);
+                        hole_pages += start_pfn - prev_end_pfn;
+                }
+                prev_end_pfn = early_node_map[i].end_pfn;
+        }
+
+        return hole_pages;
+}
+
+/**
+ * absent_pages_in_range - Return number of page frames in holes within a range
+ * @start_pfn: The start PFN to start searching for holes
+ * @end_pfn: The end PFN to stop searching for holes
+ *
+ * It returns the number of page frames in memory holes within a range
+ */
+unsigned long __init absent_pages_in_range(unsigned long start_pfn,
+                                                unsigned long end_pfn)
+{
+        return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
+}
+
+/* Return the number of page frames in holes in a zone on a node */
+unsigned long __init zone_absent_pages_in_node(int nid,
+                                                unsigned long zone_type,
+                                                unsigned long *ignored)
+{
+        return __absent_pages_in_range(nid,
+                                arch_zone_lowest_possible_pfn[zone_type],
+                                arch_zone_highest_possible_pfn[zone_type]);
+}
+#else
+static inline unsigned long zone_spanned_pages_in_node(int nid,
+                                        unsigned long zone_type,
+                                        unsigned long *zones_size)
+{
+        return zones_size[zone_type];
+}
+
+static inline unsigned long zone_absent_pages_in_node(int nid,
+                                        unsigned long zone_type,
+                                        unsigned long *zholes_size)
+{
+        if (!zholes_size)
+                return 0;
+
+        return zholes_size[zone_type];
+}
+#endif
+
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
+                unsigned long *zones_size, unsigned long *zholes_size)
+{
+        unsigned long realtotalpages, totalpages = 0;
+        enum zone_type i;
+
+        for (i = 0; i < MAX_NR_ZONES; i++)
+                totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
+                                                        zones_size);
+        pgdat->node_spanned_pages = totalpages;
+
+        realtotalpages = totalpages;
+        for (i = 0; i < MAX_NR_ZONES; i++)
+                realtotalpages -=
+                        zone_absent_pages_in_node(pgdat->node_id, i,
+                                                        zholes_size);
+        pgdat->node_present_pages = realtotalpages;
+        printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
+                                                        realtotalpages);
+}
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
@@ -2000,9 +2276,9 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                 struct zone *zone = pgdat->node_zones + j;
                 unsigned long size, realsize;
 
-                realsize = size = zones_size[j];
-                if (zholes_size)
-                        realsize -= zholes_size[j];
+                size = zone_spanned_pages_in_node(nid, j, zones_size);
+                realsize = size - zone_absent_pages_in_node(nid, j,
+                                                                zholes_size);
 
                 if (!is_highmem_idx(j))
                         nr_kernel_pages += realsize;
@@ -2073,8 +2349,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
         /*
          * With no DISCONTIG, the global mem_map is just set as node 0's
          */
-        if (pgdat == NODE_DATA(0))
+        if (pgdat == NODE_DATA(0)) {
                 mem_map = NODE_DATA(0)->node_mem_map;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
+                        mem_map -= pgdat->node_start_pfn;
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+        }
 #endif
 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
@@ -2085,13 +2366,236 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
 {
         pgdat->node_id = nid;
         pgdat->node_start_pfn = node_start_pfn;
-        calculate_zone_totalpages(pgdat, zones_size, zholes_size);
+        calculate_node_totalpages(pgdat, zones_size, zholes_size);
 
         alloc_node_mem_map(pgdat);
 
         free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+/**
+ * add_active_range - Register a range of PFNs backed by physical memory
+ * @nid: The node ID the range resides on
+ * @start_pfn: The start PFN of the available physical memory
+ * @end_pfn: The end PFN of the available physical memory
+ *
+ * These ranges are stored in an early_node_map[] and later used by
+ * free_area_init_nodes() to calculate zone sizes and holes. If the
+ * range spans a memory hole, it is up to the architecture to ensure
+ * the memory is not freed by the bootmem allocator. If possible
+ * the range being registered will be merged with existing ranges.
+ */
+void __init add_active_range(unsigned int nid, unsigned long start_pfn,
+                                                unsigned long end_pfn)
+{
+        int i;
+
+        printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
+                          "%d entries of %d used\n",
+                          nid, start_pfn, end_pfn,
+                          nr_nodemap_entries, MAX_ACTIVE_REGIONS);
+
+        /* Merge with existing active regions if possible */
+        for (i = 0; i < nr_nodemap_entries; i++) {
+                if (early_node_map[i].nid != nid)
+                        continue;
+
+                /* Skip if an existing region covers this new one */
+                if (start_pfn >= early_node_map[i].start_pfn &&
+                                end_pfn <= early_node_map[i].end_pfn)
+                        return;
+
+                /* Merge forward if suitable */
+                if (start_pfn <= early_node_map[i].end_pfn &&
+                                end_pfn > early_node_map[i].end_pfn) {
+                        early_node_map[i].end_pfn = end_pfn;
+                        return;
+                }
+
+                /* Merge backward if suitable */
+                if (start_pfn < early_node_map[i].end_pfn &&
+                                end_pfn >= early_node_map[i].start_pfn) {
+                        early_node_map[i].start_pfn = start_pfn;
+                        return;
+                }
+        }
+
+        /* Check that early_node_map is large enough */
+        if (i >= MAX_ACTIVE_REGIONS) {
+                printk(KERN_CRIT "More than %d memory regions, truncating\n",
+                                                        MAX_ACTIVE_REGIONS);
+                return;
+        }
+
+        early_node_map[i].nid = nid;
+        early_node_map[i].start_pfn = start_pfn;
+        early_node_map[i].end_pfn = end_pfn;
+        nr_nodemap_entries = i + 1;
+}
+
+/**
+ * shrink_active_range - Shrink an existing registered range of PFNs
+ * @nid: The node id the range is on that should be shrunk
+ * @old_end_pfn: The old end PFN of the range
+ * @new_end_pfn: The new end PFN of the range
+ *
+ * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
+ * The map is kept at the end of the physical page range that has already
+ * been registered with add_active_range(). This function allows an arch to
+ * shrink an existing registered range.
+ */
+void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
+                                                unsigned long new_end_pfn)
+{
+        int i;
+
+        /* Find the old active region end and shrink */
+        for_each_active_range_index_in_nid(i, nid)
+                if (early_node_map[i].end_pfn == old_end_pfn) {
+                        early_node_map[i].end_pfn = new_end_pfn;
+                        break;
+                }
+}
+
+/**
+ * remove_all_active_ranges - Remove all currently registered regions
+ * During discovery, it may be found that a table like SRAT is invalid
+ * and an alternative discovery method must be used. This function removes
+ * all currently registered regions.
+ */
+void __init remove_all_active_ranges()
+{
+        memset(early_node_map, 0, sizeof(early_node_map));
+        nr_nodemap_entries = 0;
+}
+
+/* Compare two active node_active_regions */
+static int __init cmp_node_active_region(const void *a, const void *b)
+{
+        struct node_active_region *arange = (struct node_active_region *)a;
+        struct node_active_region *brange = (struct node_active_region *)b;
+
+        /* Done this way to avoid overflows */
+        if (arange->start_pfn > brange->start_pfn)
+                return 1;
+        if (arange->start_pfn < brange->start_pfn)
+                return -1;
+
+        return 0;
+}
+
+/* sort the node_map by start_pfn */
+static void __init sort_node_map(void)
+{
+        sort(early_node_map, (size_t)nr_nodemap_entries,
+                        sizeof(struct node_active_region),
+                        cmp_node_active_region, NULL);
+}
+
+/* Find the lowest pfn for a node. This depends on a sorted early_node_map */
+unsigned long __init find_min_pfn_for_node(unsigned long nid)
+{
+        int i;
+
+        /* Assuming a sorted map, the first range found has the starting pfn */
+        for_each_active_range_index_in_nid(i, nid)
+                return early_node_map[i].start_pfn;
+
+        printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid);
+        return 0;
+}
+
+/**
+ * find_min_pfn_with_active_regions - Find the minimum PFN registered
+ *
+ * It returns the minimum PFN based on information provided via
+ * add_active_range()
+ */
+unsigned long __init find_min_pfn_with_active_regions(void)
+{
+        return find_min_pfn_for_node(MAX_NUMNODES);
+}
+
+/**
+ * find_max_pfn_with_active_regions - Find the maximum PFN registered
+ *
+ * It returns the maximum PFN based on information provided via
+ * add_active_range()
+ */
+unsigned long __init find_max_pfn_with_active_regions(void)
+{
+        int i;
+        unsigned long max_pfn = 0;
+
+        for (i = 0; i < nr_nodemap_entries; i++)
+                max_pfn = max(max_pfn, early_node_map[i].end_pfn);
+
+        return max_pfn;
+}
+
+/**
+ * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
+ * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
+ * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
+ * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
+ *
+ * This will call free_area_init_node() for each active node in the system.
+ * Using the page ranges provided by add_active_range(), the size of each
+ * zone in each node and their holes is calculated. If the maximum PFN
+ * between two adjacent zones match, it is assumed that the zone is empty.
+ * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
+ * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
+ * starts where the previous one ended. For example, ZONE_DMA32 starts
+ * at arch_max_dma_pfn.
+ */
+void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+{
+        unsigned long nid;
+        enum zone_type i;
+
+        /* Record where the zone boundaries are */
+        memset(arch_zone_lowest_possible_pfn, 0,
+                        sizeof(arch_zone_lowest_possible_pfn));
+        memset(arch_zone_highest_possible_pfn, 0,
+                        sizeof(arch_zone_highest_possible_pfn));
+        arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
+        arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
+        for (i = 1; i < MAX_NR_ZONES; i++) {
+                arch_zone_lowest_possible_pfn[i] =
+                        arch_zone_highest_possible_pfn[i-1];
+                arch_zone_highest_possible_pfn[i] =
+                        max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
+        }
+
+        /* Regions in the early_node_map can be in any order */
+        sort_node_map();
+
+        /* Print out the zone ranges */
+        printk("Zone PFN ranges:\n");
+        for (i = 0; i < MAX_NR_ZONES; i++)
+                printk(" %-8s %8lu -> %8lu\n",
+                                zone_names[i],
+                                arch_zone_lowest_possible_pfn[i],
+                                arch_zone_highest_possible_pfn[i]);
+
+        /* Print out the early_node_map[] */
+        printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
+        for (i = 0; i < nr_nodemap_entries; i++)
+                printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
+                                                early_node_map[i].start_pfn,
+                                                early_node_map[i].end_pfn);
+
+        /* Initialise every node */
+        for_each_online_node(nid) {
+                pg_data_t *pgdat = NODE_DATA(nid);
+                free_area_init_node(nid, pgdat, NULL,
+                                find_min_pfn_for_node(nid), NULL);
+        }
+}
+#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };