Diffstat (limited to 'arch/x86/mm/discontig_32.c')
-rw-r--r--  arch/x86/mm/discontig_32.c  186
1 file changed, 95 insertions(+), 91 deletions(-)
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 8b4eac0ca07d..a2f73ba42b8b 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map     - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id.  so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id.  so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- * physnode_map[0-3] = 0;
- * physnode_map[4-7] = 1;
- * physnode_map[8- ] = -1;
+ * physnode_map[0-15] = 0;
+ * physnode_map[16-31] = 1;
+ * physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -81,9 +82,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	printk(KERN_DEBUG "  ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk("%ld ", pfn);
+		printk(KERN_CONT "%ld ", pfn);
 	}
-	printk("\n");
+	printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -119,11 +119,11 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory. */
-	propagate_e820_map();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
+	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
 	/* Indicate there is one node available. */
 	nodes_clear(node_online_map);
@@ -159,9 +159,17 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				(nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
+				sizeof(pg_data_t),
+				PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+				"NODE_DATA");
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
 /*
@@ -199,8 +207,12 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+				(unsigned long)vaddr,
+				node_remap_start_pfn[node] + pfn);
 			set_pmd_pfn((ulong) vaddr,
 				node_remap_start_pfn[node] + pfn,
 				PAGE_KERNEL_LARGE);
@@ -212,17 +224,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_kva_target;
+		u64 node_kva_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk("node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
 
@@ -234,39 +250,45 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
+		node_kva_target = round_down(node_end_pfn[nid] - size,
+						PTRS_PER_PTE);
+		node_kva_target <<= PAGE_SHIFT;
+		do {
+			node_kva_final = find_e820_area(node_kva_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+					((u64)size)<<PAGE_SHIFT,
+					LARGE_PAGE_BYTES);
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_kva_final == -1ULL)
+			panic("Can not get kva ram\n");
 
-		if (pfn != node_end_pfn[nid])
-			size = 0;
-
-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+			size, nid, node_kva_final>>PAGE_SHIFT);
 
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		/*
+		 *  prevent kva address below max_low_pfn want it on system
+		 *  with less memory later.
+		 *  layout will be: KVA address , KVA RAM
+		 *
+		 *  we are supposed to only record the one less then max_low_pfn
+		 *  but we could have some hole in high memory, and it will only
+		 *  check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 *  to use it as free.
+		 *  So reserve_early here, hope we don't run out of that array
+		 */
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
 
-		node_end_pfn[nid] -= size;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					node_remap_start_pfn[nid] + size);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
@@ -284,8 +306,7 @@ static void init_remap_allocator(int nid)
 
 	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
-		(ulong) pfn_to_kaddr(highstart_pfn
-			+ node_remap_offset[nid] + node_remap_size[nid]));
+		(ulong) node_remap_end_vaddr[nid]);
 }
 
 extern void setup_bootmem_allocator(void);
@@ -293,7 +314,7 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long wasted_pages;
+	long kva_target_pfn;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -302,34 +323,38 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
+
+	/* call find_max_low_pfn at first, it could update max_pfn */
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+	remove_all_active_ranges();
 	get_memcfg_numa();
 
-	kva_pages = calculate_numa_remap_pages();
+	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Numa kva area is below the initrd */
-	if (initrd_start)
-		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-			- kva_pages;
-#endif
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
 
-	/*
-	 * We waste pages past at the end of the KVA for no good reason other
-	 * than how it is located. This is bad.
-	 */
-	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-	kva_start_pfn -= wasted_pages;
-	kva_pages += wasted_pages;
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+			(kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+			"KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -365,16 +390,8 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
-	int nid;
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] =
@@ -384,27 +401,18 @@ void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
 #endif
 
-	/* If SRAT has not registered memory, register it now */
-	if (find_max_pfn_with_active_regions() == 0) {
-		for_each_online_node(nid) {
-			if (node_has_online_mem(nid))
-				add_active_range(nid, node_start_pfn[nid],
-						node_end_pfn[nid]);
-		}
-	}
-
 	free_area_init_nodes(max_zone_pfns);
 	return;
 }
 
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -412,16 +420,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+		nid = zone_to_nid(zone);
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-			zone->name, zone_to_nid(zone),
-			zone_start_pfn, zone_end_pfn);
-
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+			zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				zone_end_pfn);
 	}
 	totalram_pages += totalhigh_pages;
 #endif