Diffstat (limited to 'arch/x86/mm/discontig_32.c')
-rw-r--r--   arch/x86/mm/discontig_32.c   186
1 file changed, 95 insertions(+), 91 deletions(-)
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 8b4eac0ca07d..a2f73ba42b8b 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -38,6 +38,7 @@
 #include <asm/setup.h>
 #include <asm/mmzone.h>
 #include <asm/bios_ebda.h>
+#include <asm/proto.h>
 
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
@@ -59,14 +60,14 @@ unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 /*
  * 4) physnode_map - the mapping between a pfn and owning node
  * physnode_map keeps track of the physical memory layout of a generic
- * numa node on a 256Mb break (each element of the array will
- * represent 256Mb of memory and will be marked by the node id. so,
+ * numa node on a 64Mb break (each element of the array will
+ * represent 64Mb of memory and will be marked by the node id. so,
  * if the first gig is on node 0, and the second gig is on node 1
  * physnode_map will contain:
  *
- * physnode_map[0-3] = 0;
- * physnode_map[4-7] = 1;
- * physnode_map[8- ] = -1;
+ * physnode_map[0-15] = 0;
+ * physnode_map[16-31] = 1;
+ * physnode_map[32- ] = -1;
  */
 s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
@@ -81,9 +82,9 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	printk(KERN_DEBUG " ");
 	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
 		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
-		printk("%ld ", pfn);
+		printk(KERN_CONT "%ld ", pfn);
 	}
-	printk("\n");
+	printk(KERN_CONT "\n");
 }
 
 unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
@@ -99,7 +100,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 #endif
 
 extern unsigned long find_max_low_pfn(void);
-extern void add_one_highpage_init(struct page *, int, int);
 extern unsigned long highend_pfn, highstart_pfn;
 
 #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
@@ -119,11 +119,11 @@ int __init get_memcfg_numa_flat(void)
 {
 	printk("NUMA - single node, flat memory mode\n");
 
-	/* Run the memory configuration and find the top of memory. */
-	propagate_e820_map();
 	node_start_pfn[0] = 0;
 	node_end_pfn[0] = max_pfn;
+	e820_register_active_regions(0, 0, max_pfn);
 	memory_present(0, 0, max_pfn);
+	node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
 
 	/* Indicate there is one node available. */
 	nodes_clear(node_online_map);
@@ -159,9 +159,17 @@ static void __init allocate_pgdat(int nid)
 	if (nid && node_has_online_mem(nid) && node_remap_start_vaddr[nid])
 		NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
 	else {
-		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(min_low_pfn));
-		min_low_pfn += PFN_UP(sizeof(pg_data_t));
+		unsigned long pgdat_phys;
+		pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
+				 (nid ? max_low_pfn:max_pfn_mapped)<<PAGE_SHIFT,
+				 sizeof(pg_data_t),
+				 PAGE_SIZE);
+		NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
+		reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t),
+			      "NODE_DATA");
 	}
+	printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
+		nid, (unsigned long)NODE_DATA(nid));
 }
 
 /*
@@ -199,8 +207,12 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
+		printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
+			printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
+				(unsigned long)vaddr,
+				node_remap_start_pfn[node] + pfn);
 			set_pmd_pfn((ulong) vaddr,
 				node_remap_start_pfn[node] + pfn,
 				PAGE_KERNEL_LARGE);
@@ -212,17 +224,21 @@ static unsigned long calculate_numa_remap_pages(void)
 {
 	int nid;
 	unsigned long size, reserve_pages = 0;
-	unsigned long pfn;
 
 	for_each_online_node(nid) {
-		unsigned old_end_pfn = node_end_pfn[nid];
+		u64 node_kva_target;
+		u64 node_kva_final;
 
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
 		 */
+		printk("node %d pfn: [%lx - %lx]\n",
+			nid, node_start_pfn[nid], node_end_pfn[nid]);
 		if (node_start_pfn[nid] > max_pfn)
 			continue;
+		if (!node_end_pfn[nid])
+			continue;
 		if (node_end_pfn[nid] > max_pfn)
 			node_end_pfn[nid] = max_pfn;
 
@@ -234,39 +250,45 @@ static unsigned long calculate_numa_remap_pages(void)
 		/* now the roundup is correct, convert to PAGE_SIZE pages */
 		size = size * PTRS_PER_PTE;
 
-		/*
-		 * Validate the region we are allocating only contains valid
-		 * pages.
-		 */
-		for (pfn = node_end_pfn[nid] - size;
-		     pfn < node_end_pfn[nid]; pfn++)
-			if (!page_is_ram(pfn))
-				break;
+		node_kva_target = round_down(node_end_pfn[nid] - size,
+						 PTRS_PER_PTE);
+		node_kva_target <<= PAGE_SHIFT;
+		do {
+			node_kva_final = find_e820_area(node_kva_target,
+					((u64)node_end_pfn[nid])<<PAGE_SHIFT,
+						((u64)size)<<PAGE_SHIFT,
+						LARGE_PAGE_BYTES);
+			node_kva_target -= LARGE_PAGE_BYTES;
+		} while (node_kva_final == -1ULL &&
+			 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
+
+		if (node_kva_final == -1ULL)
+			panic("Can not get kva ram\n");
 
-		if (pfn != node_end_pfn[nid])
-			size = 0;
-
-		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
-				size, nid);
 		node_remap_size[nid] = size;
 		node_remap_offset[nid] = reserve_pages;
 		reserve_pages += size;
-		printk("Shrinking node %d from %ld pages to %ld pages\n",
-			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
+		printk("Reserving %ld pages of KVA for lmem_map of node %d at %llx\n",
+				size, nid, node_kva_final>>PAGE_SHIFT);
 
-		if (node_end_pfn[nid] & (PTRS_PER_PTE-1)) {
-			/*
-			 * Align node_end_pfn[] and node_remap_start_pfn[] to
-			 * pmd boundary. remap_numa_kva will barf otherwise.
-			 */
-			printk("Shrinking node %d further by %ld pages for proper alignment\n",
-				nid, node_end_pfn[nid] & (PTRS_PER_PTE-1));
-			size += node_end_pfn[nid] & (PTRS_PER_PTE-1);
-		}
+		/*
+		 * prevent kva address below max_low_pfn want it on system
+		 * with less memory later.
+		 * layout will be: KVA address , KVA RAM
+		 *
+		 * we are supposed to only record the one less then max_low_pfn
+		 * but we could have some hole in high memory, and it will only
+		 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
+		 * to use it as free.
+		 * So reserve_early here, hope we don't run out of that array
+		 */
+		reserve_early(node_kva_final,
+			      node_kva_final+(((u64)size)<<PAGE_SHIFT),
+			      "KVA RAM");
 
-		node_end_pfn[nid] -= size;
-		node_remap_start_pfn[nid] = node_end_pfn[nid];
-		shrink_active_range(nid, old_end_pfn, node_end_pfn[nid]);
+		node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
+		remove_active_range(nid, node_remap_start_pfn[nid],
+					 node_remap_start_pfn[nid] + size);
 	}
 	printk("Reserving total of %ld pages for numa KVA remap\n",
 			reserve_pages);
@@ -284,8 +306,7 @@ static void init_remap_allocator(int nid)
 
 	printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 		(ulong) node_remap_start_vaddr[nid],
-		(ulong) pfn_to_kaddr(highstart_pfn
-		   + node_remap_offset[nid] + node_remap_size[nid]));
+		(ulong) node_remap_end_vaddr[nid]);
 }
 
 extern void setup_bootmem_allocator(void);
@@ -293,7 +314,7 @@ unsigned long __init setup_memory(void)
 {
 	int nid;
 	unsigned long system_start_pfn, system_max_low_pfn;
-	unsigned long wasted_pages;
+	long kva_target_pfn;
 
 	/*
 	 * When mapping a NUMA machine we allocate the node_mem_map arrays
@@ -302,34 +323,38 @@ unsigned long __init setup_memory(void)
 	 * this space and use it to adjust the boundary between ZONE_NORMAL
 	 * and ZONE_HIGHMEM.
 	 */
+
+	/* call find_max_low_pfn at first, it could update max_pfn */
+	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
+
+	remove_all_active_ranges();
 	get_memcfg_numa();
 
-	kva_pages = calculate_numa_remap_pages();
+	kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE);
 
 	/* partially used pages are not usable - thus round upwards */
 	system_start_pfn = min_low_pfn = PFN_UP(init_pg_tables_end);
 
-	kva_start_pfn = find_max_low_pfn() - kva_pages;
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Numa kva area is below the initrd */
-	if (initrd_start)
-		kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET)
-			- kva_pages;
-#endif
+	kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
+	do {
+		kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
+					max_low_pfn<<PAGE_SHIFT,
+					kva_pages<<PAGE_SHIFT,
+					PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
+		kva_target_pfn -= PTRS_PER_PTE;
+	} while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
 
-	/*
-	 * We waste pages past at the end of the KVA for no good reason other
-	 * than how it is located. This is bad.
-	 */
-	wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1);
-	kva_start_pfn -= wasted_pages;
-	kva_pages += wasted_pages;
+	if (kva_start_pfn == -1UL)
+		panic("Can not get kva space\n");
 
-	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
 	printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n",
 		kva_start_pfn, max_low_pfn);
 	printk("max_pfn = %ld\n", max_pfn);
+
+	/* avoid clash with initrd */
+	reserve_early(kva_start_pfn<<PAGE_SHIFT,
+		      (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
+		      "KVA PG");
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > system_max_low_pfn)
@@ -365,16 +390,8 @@ unsigned long __init setup_memory(void)
 	return max_low_pfn;
 }
 
-void __init numa_kva_reserve(void)
-{
-	if (kva_pages)
-		reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages),
-				BOOTMEM_DEFAULT);
-}
-
 void __init zone_sizes_init(void)
 {
-	int nid;
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
 	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 	max_zone_pfns[ZONE_DMA] =
@@ -384,27 +401,18 @@ void __init zone_sizes_init(void)
 	max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
 #endif
 
-	/* If SRAT has not registered memory, register it now */
-	if (find_max_pfn_with_active_regions() == 0) {
-		for_each_online_node(nid) {
-			if (node_has_online_mem(nid))
-				add_active_range(nid, node_start_pfn[nid],
-						 node_end_pfn[nid]);
-		}
-	}
-
 	free_area_init_nodes(max_zone_pfns);
 	return;
 }
 
-void __init set_highmem_pages_init(int bad_ppro)
+void __init set_highmem_pages_init(void)
 {
 #ifdef CONFIG_HIGHMEM
 	struct zone *zone;
-	struct page *page;
+	int nid;
 
 	for_each_zone(zone) {
-		unsigned long node_pfn, zone_start_pfn, zone_end_pfn;
+		unsigned long zone_start_pfn, zone_end_pfn;
 
 		if (!is_highmem(zone))
 			continue;
@@ -412,16 +420,12 @@ void __init set_highmem_pages_init(int bad_ppro)
 		zone_start_pfn = zone->zone_start_pfn;
 		zone_end_pfn = zone_start_pfn + zone->spanned_pages;
 
+		nid = zone_to_nid(zone);
 		printk("Initializing %s for node %d (%08lx:%08lx)\n",
-				zone->name, zone_to_nid(zone),
-				zone_start_pfn, zone_end_pfn);
-
-		for (node_pfn = zone_start_pfn; node_pfn < zone_end_pfn; node_pfn++) {
-			if (!pfn_valid(node_pfn))
-				continue;
-			page = pfn_to_page(node_pfn);
-			add_one_highpage_init(page, node_pfn, bad_ppro);
-		}
+				zone->name, nid, zone_start_pfn, zone_end_pfn);
+
+		add_highpages_with_active_regions(nid, zone_start_pfn,
+				 zone_end_pfn);
 	}
 	totalram_pages += totalhigh_pages;
 #endif
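
Note: the core pattern this patch introduces in calculate_numa_remap_pages() and setup_memory() is "aim just below the top of the node, ask the e820 allocator for a free block, and walk the target down one large page at a time until something fits". Below is a minimal, self-contained userspace sketch of that search loop, not kernel code: find_free_area() is a stub standing in for the kernel's find_e820_area(), and the RAM ranges and sizes are made-up example values.

/* sketch.c - illustrative stand-in for the downward e820 search in the patch */
#include <stdio.h>
#include <stdint.h>

#define LARGE_PAGE_BYTES (512ULL * 4096ULL)   /* one PMD's worth of 4k pages */

struct range { uint64_t start, end; };
/* made-up "usable RAM" map standing in for the e820 table */
static const struct range ram[] = {
	{ 0x00100000ULL, 0x1f000000ULL },
	{ 0x20000000ULL, 0x38000000ULL },
};

/* stub stand-in for find_e820_area(): first aligned block of 'size'
 * bytes inside [start, end) that lies entirely in a RAM range */
static uint64_t find_free_area(uint64_t start, uint64_t end,
			       uint64_t size, uint64_t align)
{
	for (unsigned i = 0; i < sizeof(ram) / sizeof(ram[0]); i++) {
		uint64_t base = ram[i].start > start ? ram[i].start : start;
		base = (base + align - 1) & ~(align - 1);
		if (base + size <= ram[i].end && base + size <= end)
			return base;
	}
	return -1ULL;	/* same "not found" convention as the patch */
}

int main(void)
{
	uint64_t node_start = 0x00100000ULL;	/* example node span */
	uint64_t node_end   = 0x38000000ULL;
	uint64_t size       = 4 * LARGE_PAGE_BYTES;

	/* same shape as the patch: start just below node_end, step down */
	uint64_t target = (node_end - size) & ~(LARGE_PAGE_BYTES - 1);
	uint64_t final;

	do {
		final = find_free_area(target, node_end, size, LARGE_PAGE_BYTES);
		target -= LARGE_PAGE_BYTES;
	} while (final == -1ULL && target > node_start);

	if (final == -1ULL)
		printf("no room for KVA remap area\n");
	else
		printf("KVA remap area at %#llx - %#llx\n",
		       (unsigned long long)final,
		       (unsigned long long)(final + size));
	return 0;
}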