Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r--	arch/x86/mm/numa_64.c	506
1 file changed, 328 insertions(+), 178 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 				    unsigned long end, unsigned long size,
 				    unsigned long align)
 {
-	unsigned long mem = find_e820_area(start, end, size, align);
-	void *ptr;
+	unsigned long mem;
 
+	/*
+	 * put it on high as possible
+	 * something will go with NODE_DATA
+	 */
+	if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+		start = MAX_DMA_PFN<<PAGE_SHIFT;
+	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+	    end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+		start = MAX_DMA32_PFN<<PAGE_SHIFT;
+	mem = find_e820_area(start, end, size, align);
+	if (mem != -1L)
+		return __va(mem);
+
+	/* extend the search scope */
+	end = max_pfn_mapped << PAGE_SHIFT;
+	if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+		start = MAX_DMA32_PFN<<PAGE_SHIFT;
+	else
+		start = MAX_DMA_PFN<<PAGE_SHIFT;
+	mem = find_e820_area(start, end, size, align);
 	if (mem != -1L)
 		return __va(mem);
 
-	ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
-	if (ptr == NULL) {
-		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
 	       size, nodeid);
-		return NULL;
-	}
-	return ptr;
+
+	return NULL;
 }
 
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
-	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+	unsigned long start_pfn, last_pfn, nodedata_phys;
 	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-	unsigned long bootmap_start, nodedata_phys;
-	void *bootmap;
 	int nid;
+#ifndef CONFIG_NO_BOOTMEM
+	unsigned long bootmap_start, bootmap_pages, bootmap_size;
+	void *bootmap;
+#endif
 
 	if (!end)
 		return;
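
For reference, the two cutoffs steering the new early_node_mem() search are the usual x86 DMA boundaries. A sketch of the arithmetic (illustrative values assuming 4KB pages; the real definitions live in asm/dma.h):

	/* illustrative, not part of the patch */
	MAX_DMA_PFN   << PAGE_SHIFT	/* = 16MB, end of ZONE_DMA   */
	MAX_DMA32_PFN << PAGE_SHIFT	/* =  4GB, end of ZONE_DMA32 */

	/* e.g. a node spanning 0-8GB searches [4GB, 8GB) first, keeping
	 * NODE_DATA out of the scarce DMA zones; only if that fails does
	 * the "extend the search scope" fallback widen the window. */
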
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 
 	start = roundup(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
 	       start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	if (node_data[nodeid] == NULL)
 		return;
 	nodedata_phys = __pa(node_data[nodeid]);
+	reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
 	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
 		nodedata_phys + pgdat_size - 1);
+	nid = phys_to_nid(nodedata_phys);
+	if (nid != nodeid)
+		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
 
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
-	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+	NODE_DATA(nodeid)->node_id = nodeid;
 	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
+#ifndef CONFIG_NO_BOOTMEM
+	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
 	/*
 	 * Find a place for the bootmem map
 	 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	 * of alloc_bootmem, that could clash with reserved range
 	 */
 	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
-	nid = phys_to_nid(nodedata_phys);
-	if (nid == nodeid)
-		bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
-	else
-		bootmap_start = roundup(start, PAGE_SIZE);
+	bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
 	/*
 	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 	 * to use that to align to PAGE_SIZE
@@ -239,12 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
 				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	if (bootmap == NULL) {
-		if (nodedata_phys < start || nodedata_phys >= end)
-			free_bootmem(nodedata_phys, pgdat_size);
+		free_early(nodedata_phys, nodedata_phys + pgdat_size);
 		node_data[nodeid] = NULL;
 		return;
 	}
 	bootmap_start = __pa(bootmap);
+	reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+			"BOOTMAP");
 
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap_start >> PAGE_SHIFT,
@@ -253,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
 		 bootmap_start, bootmap_start + bootmap_size - 1,
 		 bootmap_pages);
-
-	free_bootmem_with_active_regions(nodeid, end);
-
-	/*
-	 * convert early reserve to bootmem reserve earlier
-	 * otherwise early_node_mem could use early reserved mem
-	 * on previous node
-	 */
-	early_res_to_bootmem(start, end);
-
-	/*
-	 * in some case early_node_mem could use alloc_bootmem
-	 * to get range on other node, don't reserve that again
-	 */
-	if (nid != nodeid)
-		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
-	else
-		reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
-					pgdat_size, BOOTMEM_DEFAULT);
 	nid = phys_to_nid(bootmap_start);
 	if (nid != nodeid)
 		printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
-	else
-		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
-				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+	free_bootmem_with_active_regions(nodeid, end);
+#endif
 
 	node_set_online(nodeid);
 }
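
Taken together, the setup_node_bootmem() hunks above let the function compile down to the early_res path when bootmem is disabled. A rough condensation of the CONFIG_NO_BOOTMEM shape (paraphrase for orientation, not patch text):

	/* with CONFIG_NO_BOOTMEM set, everything between the
	 * #ifndef/#endif pair is dropped and the function reduces to: */
	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, ...);
	reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
	/* fill node_id, node_start_pfn, node_spanned_pages */
	node_set_online(nodeid);
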
@@ -306,8 +309,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+				  int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size. If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +381,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise. addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -335,167 +399,234 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 }
 
 /*
- * Splits num_nodes nodes up equally starting at node_start. The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr. The return value is the number of nodes allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
-				      int num_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+					 int nr_phys_nodes, int nr_nodes)
 {
-	unsigned int big;
+	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
+	int big;
+	int ret = 0;
 	int i;
 
-	if (num_nodes <= 0)
+	if (nr_nodes <= 0)
 		return -1;
-	if (num_nodes > MAX_NUMNODES)
-		num_nodes = MAX_NUMNODES;
-	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
-	       num_nodes;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
 	/*
 	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the leftovers.
+	 * of consolidating the remainder.
 	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 		FAKE_NODE_MIN_SIZE;
 
-	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
 	size &= FAKE_NODE_MIN_HASH_MASK;
 	if (!size) {
-		printk(KERN_ERR "Not enough memory for each node. "
+		pr_err("Not enough memory for each node. "
 			"NUMA emulation disabled.\n");
 		return -1;
 	}
 
-	for (i = node_start; i < num_nodes + node_start; i++) {
-		u64 end = *addr + size;
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
 
-		if (i < big)
-			end += FAKE_NODE_MIN_SIZE;
-		/*
-		 * The final node can have the remaining system RAM. Other
-		 * nodes receive roughly the same amount of available pages.
-		 */
-		if (i == num_nodes + node_start - 1)
-			end = max_addr;
-		else
-			while (end - *addr - e820_hole_size(*addr, end) <
-			       size) {
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
 				end += FAKE_NODE_MIN_SIZE;
-				if (end > max_addr) {
-					end = max_addr;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
 					break;
 				}
 			}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
-			break;
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+					     end - physnodes[i].start,
+					     physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
 	}
-	return i - node_start + 1;
+	return ret;
 }
 
 /*
- * Splits the remaining system RAM into chunks of size. The remaining memory is
- * always assigned to a final node and can be asymmetric. Returns the number of
- * nodes split.
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 {
-	int i = node_start;
-	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
-		;
-	return i - node_start;
+	u64 end = start + size;
+
+	while (end - start - e820_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end = max_addr;
+			break;
+		}
+	}
+	return end;
 }
 
 /*
- * Sets up the system RAM area from start_pfn to last_pfn according to the
- * numa=fake command-line option.
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 {
-	u64 size, addr = start_pfn << PAGE_SHIFT;
-	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int ret = 0;
+	int i;
 
-	memset(&nodes, 0, sizeof(nodes));
+	if (!size)
+		return -1;
 	/*
-	 * If the numa=fake command-line is just a single number N, split the
-	 * system RAM into N fake nodes.
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small. This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
 	 */
-	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-		long n = simple_strtol(cmdline, NULL, 0);
-
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
-		if (num_nodes < 0)
-			return num_nodes;
-		goto out;
+	min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
 	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
 
-	/* Parse the command line. */
-	for (coeff_flag = 0; ; cmdline++) {
-		if (*cmdline && isdigit(*cmdline)) {
-			num = num * 10 + *cmdline - '0';
-			continue;
-		}
-		if (*cmdline == '*') {
-			if (num > 0)
-				coeff = num;
-			coeff_flag = 1;
-		}
-		if (!*cmdline || *cmdline == ',') {
-			if (!coeff_flag)
-				coeff = 1;
-			/*
-			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-			 * Command-line coefficients are in megabytes.
-			 */
-			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-			if (size)
-				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
-						goto done;
-			if (!*cmdline)
-				break;
-			coeff_flag = 0;
-			coeff = -1;
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 end;
+
+			end = find_end_of_node(physnodes[i].start,
+						physnodes[i].end, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Setup the fake node that will be allocated as bootmem
+			 * later. If setup_node_range() returns non-zero, there
+			 * is no more memory available on this physical node.
+			 */
+			if (setup_node_range(ret++, &physnodes[i].start,
+					     end - physnodes[i].start,
+					     physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
 		}
-		num = 0;
 	}
-done:
-	if (!num_nodes)
-		return -1;
-	/* Fill remainder of system RAM, if appropriate. */
-	if (addr < max_addr) {
-		if (coeff_flag && coeff < 0) {
-			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
-							 num_nodes, num);
-			goto out;
-		}
-		switch (*(cmdline - 1)) {
-		case '*':
-			/* Split remaining nodes into coeff chunks */
-			if (coeff <= 0)
-				break;
-			num_nodes += split_nodes_equally(nodes, &addr, max_addr,
-							 num_nodes, coeff);
-			break;
-		case ',':
-			/* Do not allocate remaining system RAM */
-			break;
-		default:
-			/* Give one final node */
-			setup_node_range(num_nodes, nodes, &addr,
-					 max_addr - addr, max_addr);
-			num_nodes++;
-		}
-	}
-out:
+	return ret;
+}
+
+/*
+ * Sets up the system RAM area from start_pfn to last_pfn according to the
+ * numa=fake command-line option.
+ */
+static int __init numa_emulation(unsigned long start_pfn,
+				 unsigned long last_pfn, int acpi, int k8)
+{
+	u64 addr = start_pfn << PAGE_SHIFT;
+	u64 max_addr = last_pfn << PAGE_SHIFT;
+	int num_phys_nodes;
+	int num_nodes;
+	int i;
+
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
+	/*
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size. Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
+	 */
+	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+		u64 size;
+
+		size = memparse(cmdline, &cmdline);
+		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(cmdline, NULL, 0);
+		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+	}
+
+	if (num_nodes < 0)
+		return num_nodes;
 	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;
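
To make the interleaving concrete, a worked example (hypothetical machine chosen for round numbers, no e820 holes assumed):

	/* two 4GB physical nodes, booted with numa=fake=8:
	 *   size = 8GB / 8 = 1GB (a multiple of FAKE_NODE_MIN_SIZE,
	 *   so no "big" remainder nodes are needed);
	 * the while/for_each_node_mask() pair deals fake nodes out
	 * round-robin -- fake 0 -> phys 0, fake 1 -> phys 1,
	 * fake 2 -> phys 0, ... -- so each physical node ends up
	 * holding four 1GB fake nodes. */
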
@@ -505,14 +636,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +651,8 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+				int acpi, int k8)
 {
 	int i;
 
@@ -532,23 +660,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 					  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					last_pfn<<PAGE_SHIFT))
+	if (!numa_off && k8 && !k8_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -579,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
 	for_each_online_node(i)
 		pages += free_all_bootmem_node(NODE_DATA(i));
 
+#ifdef CONFIG_NO_BOOTMEM
+	pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
 	return pages;
 }
 
@@ -601,6 +732,25 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 #ifdef CONFIG_NUMA
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
 /*
  * Setup early cpu_to_node.
  *
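
A quick illustration of what the new helper buys init_cpu_to_node() below (distances are hypothetical, from a made-up SLIT):

	/* if node 2 is offline and node_distance(2, 0) = 20 while
	 * node_distance(2, 1) = 40, find_near_online_node(2) returns 0,
	 * so CPUs that firmware placed on node 2 are bound to node 0
	 * rather than skipped. */
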
@@ -632,7 +782,7 @@ void __init init_cpu_to_node(void)
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_online(node))
-			continue;
+			node = find_near_online_node(node);
 		numa_set_node(cpu, node);
 	}
 }
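
For completeness, the two numa=fake forms the rewritten numa_emulation() accepts (sizes here are illustrative; a suffixed value goes through memparse(), a bare count through simple_strtoul()):

	numa=fake=8	/* eight fake nodes, interleaved over the physical nodes */
	numa=fake=512M	/* as many 512MB fake nodes as fit, also interleaved */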