diff options
author | David Rientjes <rientjes@google.com> | 2009-09-25 18:20:09 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-10-12 16:56:46 -0400 |
commit | adc1938994f7f1112d335d998b5218b0aa680ad6 (patch) | |
tree | 66b15981e346145fba39e3560ef8b192e2c7e10d /arch/x86/mm/numa_64.c | |
parent | 8716273caef7f55f39fe4fc6c69c5f9f197f41f1 (diff) |
x86: Interleave emulated nodes over physical nodes
Add interleaved NUMA emulation support
This patch interleaves emulated nodes over the system's physical
nodes. This is required for interleave optimizations since
mempolicies, for example, operate by iterating over a nodemask and
act without knowledge of node distances. It can also be used for
testing memory latencies and NUMA bugs in the kernel.
There are a couple of ways to do this:
- divide the number of emulated nodes by the number of physical
nodes and allocate the result on each physical node, or
- allocate each successive emulated node on a different physical
node until all memory is exhausted.
The disadvantage of the first option is, depending on the asymmetry
in node capacities of each physical node, emulated nodes may
substantially differ in size on a particular physical node compared
to another.
The disadvantage of the second option is that, also depending on the
asymmetry in node capacities of each physical node, there may be
more emulated nodes allocated on one physical node than on another.
This patch implements the second option; we accept the
possibility that we may have slightly more emulated nodes on a
particular physical node compared to another in exchange for avoiding
node size asymmetry.
[ Note that "node capacity" of a physical node is not only a
function of its addressable range, but also is affected by
subtracting out the amount of reserved memory over that range.
NUMA emulation only deals with available, non-reserved memory
quantities. ]
We ensure there is at least a minimal amount of available memory
allocated to each node. We also make sure that at least this
amount of available memory is available in ZONE_DMA32 for any node
that includes both ZONE_DMA32 and ZONE_NORMAL.
This patch also cleans the emulation code up by no longer passing
the statically allocated struct bootnode array among the various
functions. Because this init.data array may be very large, it is not
allocated on the stack; it is defined at file scope and accessed there directly.
The WARN_ON() for nodes_cover_memory() when faking proximity
domains is removed since it relies on successive nodes always
having greater start addresses than previous nodes; with
interleaving this is no longer always true.
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r-- | arch/x86/mm/numa_64.c | 211 |
1 files changed, 184 insertions, 27 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index d1a3d94efc8e..086f98a66d80 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -306,8 +306,71 @@ void __init numa_init_array(void) | |||
306 | 306 | ||
307 | #ifdef CONFIG_NUMA_EMU | 307 | #ifdef CONFIG_NUMA_EMU |
308 | /* Numa emulation */ | 308 | /* Numa emulation */ |
309 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
310 | static struct bootnode physnodes[MAX_NUMNODES] __initdata; | ||
309 | static char *cmdline __initdata; | 311 | static char *cmdline __initdata; |
310 | 312 | ||
313 | static int __init setup_physnodes(unsigned long start, unsigned long end, | ||
314 | int acpi, int k8) | ||
315 | { | ||
316 | int nr_nodes = 0; | ||
317 | int ret = 0; | ||
318 | int i; | ||
319 | |||
320 | #ifdef CONFIG_ACPI_NUMA | ||
321 | if (acpi) | ||
322 | nr_nodes = acpi_get_nodes(physnodes); | ||
323 | #endif | ||
324 | #ifdef CONFIG_K8_NUMA | ||
325 | if (k8) | ||
326 | nr_nodes = k8_get_nodes(physnodes); | ||
327 | #endif | ||
328 | /* | ||
329 | * Basic sanity checking on the physical node map: there may be errors | ||
330 | * if the SRAT or K8 incorrectly reported the topology or the mem= | ||
331 | * kernel parameter is used. | ||
332 | */ | ||
333 | for (i = 0; i < nr_nodes; i++) { | ||
334 | if (physnodes[i].start == physnodes[i].end) | ||
335 | continue; | ||
336 | if (physnodes[i].start > end) { | ||
337 | physnodes[i].end = physnodes[i].start; | ||
338 | continue; | ||
339 | } | ||
340 | if (physnodes[i].end < start) { | ||
341 | physnodes[i].start = physnodes[i].end; | ||
342 | continue; | ||
343 | } | ||
344 | if (physnodes[i].start < start) | ||
345 | physnodes[i].start = start; | ||
346 | if (physnodes[i].end > end) | ||
347 | physnodes[i].end = end; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * Remove all nodes that have no memory or were truncated because of the | ||
352 | * limited address range. | ||
353 | */ | ||
354 | for (i = 0; i < nr_nodes; i++) { | ||
355 | if (physnodes[i].start == physnodes[i].end) | ||
356 | continue; | ||
357 | physnodes[ret].start = physnodes[i].start; | ||
358 | physnodes[ret].end = physnodes[i].end; | ||
359 | ret++; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * If no physical topology was detected, a single node is faked to cover | ||
364 | * the entire address space. | ||
365 | */ | ||
366 | if (!ret) { | ||
367 | physnodes[ret].start = start; | ||
368 | physnodes[ret].end = end; | ||
369 | ret = 1; | ||
370 | } | ||
371 | return ret; | ||
372 | } | ||
373 | |||
311 | /* | 374 | /* |
312 | * Setups up nid to range from addr to addr + size. If the end | 375 | * Setups up nid to range from addr to addr + size. If the end |
313 | * boundary is greater than max_addr, then max_addr is used instead. | 376 | * boundary is greater than max_addr, then max_addr is used instead. |
@@ -315,11 +378,9 @@ static char *cmdline __initdata; | |||
315 | * allocation past addr and -1 otherwise. addr is adjusted to be at | 378 | * allocation past addr and -1 otherwise. addr is adjusted to be at |
316 | * the end of the node. | 379 | * the end of the node. |
317 | */ | 380 | */ |
318 | static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | 381 | static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) |
319 | u64 size, u64 max_addr) | ||
320 | { | 382 | { |
321 | int ret = 0; | 383 | int ret = 0; |
322 | |||
323 | nodes[nid].start = *addr; | 384 | nodes[nid].start = *addr; |
324 | *addr += size; | 385 | *addr += size; |
325 | if (*addr >= max_addr) { | 386 | if (*addr >= max_addr) { |
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, | |||
335 | } | 396 | } |
336 | 397 | ||
337 | /* | 398 | /* |
399 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
400 | * to max_addr. The return value is the number of nodes allocated. | ||
401 | */ | ||
402 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, | ||
403 | int nr_phys_nodes, int nr_nodes) | ||
404 | { | ||
405 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
406 | u64 size; | ||
407 | int big; | ||
408 | int ret = 0; | ||
409 | int i; | ||
410 | |||
411 | if (nr_nodes <= 0) | ||
412 | return -1; | ||
413 | if (nr_nodes > MAX_NUMNODES) { | ||
414 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
415 | nr_nodes, MAX_NUMNODES); | ||
416 | nr_nodes = MAX_NUMNODES; | ||
417 | } | ||
418 | |||
419 | size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; | ||
420 | /* | ||
421 | * Calculate the number of big nodes that can be allocated as a result | ||
422 | * of consolidating the remainder. | ||
423 | */ | ||
424 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
425 | FAKE_NODE_MIN_SIZE; | ||
426 | |||
427 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
428 | if (!size) { | ||
429 | pr_err("Not enough memory for each node. " | ||
430 | "NUMA emulation disabled.\n"); | ||
431 | return -1; | ||
432 | } | ||
433 | |||
434 | for (i = 0; i < nr_phys_nodes; i++) | ||
435 | if (physnodes[i].start != physnodes[i].end) | ||
436 | node_set(i, physnode_mask); | ||
437 | |||
438 | /* | ||
439 | * Continue to fill physical nodes with fake nodes until there is no | ||
440 | * memory left on any of them. | ||
441 | */ | ||
442 | while (nodes_weight(physnode_mask)) { | ||
443 | for_each_node_mask(i, physnode_mask) { | ||
444 | u64 end = physnodes[i].start + size; | ||
445 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
446 | |||
447 | if (ret < big) | ||
448 | end += FAKE_NODE_MIN_SIZE; | ||
449 | |||
450 | /* | ||
451 | * Continue to add memory to this fake node if its | ||
452 | * non-reserved memory is less than the per-node size. | ||
453 | */ | ||
454 | while (end - physnodes[i].start - | ||
455 | e820_hole_size(physnodes[i].start, end) < size) { | ||
456 | end += FAKE_NODE_MIN_SIZE; | ||
457 | if (end > physnodes[i].end) { | ||
458 | end = physnodes[i].end; | ||
459 | break; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
465 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
466 | * this one must extend to the boundary. | ||
467 | */ | ||
468 | if (end < dma32_end && dma32_end - end - | ||
469 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
470 | end = dma32_end; | ||
471 | |||
472 | /* | ||
473 | * If there won't be enough non-reserved memory for the | ||
474 | * next node, this one must extend to the end of the | ||
475 | * physical node. | ||
476 | */ | ||
477 | if (physnodes[i].end - end - | ||
478 | e820_hole_size(end, physnodes[i].end) < size) | ||
479 | end = physnodes[i].end; | ||
480 | |||
481 | /* | ||
482 | * Avoid allocating more nodes than requested, which can | ||
483 | * happen as a result of rounding down each node's size | ||
484 | * to FAKE_NODE_MIN_SIZE. | ||
485 | */ | ||
486 | if (nodes_weight(physnode_mask) + ret >= nr_nodes) | ||
487 | end = physnodes[i].end; | ||
488 | |||
489 | if (setup_node_range(ret++, &physnodes[i].start, | ||
490 | end - physnodes[i].start, | ||
491 | physnodes[i].end) < 0) | ||
492 | node_clear(i, physnode_mask); | ||
493 | } | ||
494 | } | ||
495 | return ret; | ||
496 | } | ||
497 | |||
498 | /* | ||
338 | * Splits num_nodes nodes up equally starting at node_start. The return value | 499 | * Splits num_nodes nodes up equally starting at node_start. The return value |
339 | * is the number of nodes split up and addr is adjusted to be at the end of the | 500 | * is the number of nodes split up and addr is adjusted to be at the end of the |
340 | * last node allocated. | 501 | * last node allocated. |
341 | */ | 502 | */ |
342 | static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | 503 | static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, |
343 | u64 max_addr, int node_start, | ||
344 | int num_nodes) | 504 | int num_nodes) |
345 | { | 505 | { |
346 | unsigned int big; | 506 | unsigned int big; |
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | |||
388 | break; | 548 | break; |
389 | } | 549 | } |
390 | } | 550 | } |
391 | if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) | 551 | if (setup_node_range(i, addr, end - *addr, max_addr) < 0) |
392 | break; | 552 | break; |
393 | } | 553 | } |
394 | return i - node_start + 1; | 554 | return i - node_start + 1; |
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, | |||
399 | * always assigned to a final node and can be asymmetric. Returns the number of | 559 | * always assigned to a final node and can be asymmetric. Returns the number of |
400 | * nodes split. | 560 | * nodes split. |
401 | */ | 561 | */ |
402 | static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | 562 | static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, |
403 | u64 max_addr, int node_start, u64 size) | 563 | u64 size) |
404 | { | 564 | { |
405 | int i = node_start; | 565 | int i = node_start; |
406 | size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; | 566 | size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; |
407 | while (!setup_node_range(i++, nodes, addr, size, max_addr)) | 567 | while (!setup_node_range(i++, addr, size, max_addr)) |
408 | ; | 568 | ; |
409 | return i - node_start; | 569 | return i - node_start; |
410 | } | 570 | } |
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, | |||
413 | * Sets up the system RAM area from start_pfn to last_pfn according to the | 573 | * Sets up the system RAM area from start_pfn to last_pfn according to the |
414 | * numa=fake command-line option. | 574 | * numa=fake command-line option. |
415 | */ | 575 | */ |
416 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | 576 | static int __init numa_emulation(unsigned long start_pfn, |
417 | 577 | unsigned long last_pfn, int acpi, int k8) | |
418 | static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn) | ||
419 | { | 578 | { |
420 | u64 size, addr = start_pfn << PAGE_SHIFT; | 579 | u64 size, addr = start_pfn << PAGE_SHIFT; |
421 | u64 max_addr = last_pfn << PAGE_SHIFT; | 580 | u64 max_addr = last_pfn << PAGE_SHIFT; |
422 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; | 581 | int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; |
582 | int num_phys_nodes; | ||
423 | 583 | ||
424 | memset(&nodes, 0, sizeof(nodes)); | 584 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); |
425 | /* | 585 | /* |
426 | * If the numa=fake command-line is just a single number N, split the | 586 | * If the numa=fake command-line is just a single number N, split the |
427 | * system RAM into N fake nodes. | 587 | * system RAM into N fake nodes. |
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn | |||
429 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { | 589 | if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { |
430 | long n = simple_strtol(cmdline, NULL, 0); | 590 | long n = simple_strtol(cmdline, NULL, 0); |
431 | 591 | ||
432 | num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); | 592 | num_nodes = split_nodes_interleave(addr, max_addr, |
593 | num_phys_nodes, n); | ||
433 | if (num_nodes < 0) | 594 | if (num_nodes < 0) |
434 | return num_nodes; | 595 | return num_nodes; |
435 | goto out; | 596 | goto out; |
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn | |||
456 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; | 617 | size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; |
457 | if (size) | 618 | if (size) |
458 | for (i = 0; i < coeff; i++, num_nodes++) | 619 | for (i = 0; i < coeff; i++, num_nodes++) |
459 | if (setup_node_range(num_nodes, nodes, | 620 | if (setup_node_range(num_nodes, &addr, |
460 | &addr, size, max_addr) < 0) | 621 | size, max_addr) < 0) |
461 | goto done; | 622 | goto done; |
462 | if (!*cmdline) | 623 | if (!*cmdline) |
463 | break; | 624 | break; |
@@ -473,7 +634,7 @@ done: | |||
473 | if (addr < max_addr) { | 634 | if (addr < max_addr) { |
474 | if (coeff_flag && coeff < 0) { | 635 | if (coeff_flag && coeff < 0) { |
475 | /* Split remaining nodes into num-sized chunks */ | 636 | /* Split remaining nodes into num-sized chunks */ |
476 | num_nodes += split_nodes_by_size(nodes, &addr, max_addr, | 637 | num_nodes += split_nodes_by_size(&addr, max_addr, |
477 | num_nodes, num); | 638 | num_nodes, num); |
478 | goto out; | 639 | goto out; |
479 | } | 640 | } |
@@ -482,7 +643,7 @@ done: | |||
482 | /* Split remaining nodes into coeff chunks */ | 643 | /* Split remaining nodes into coeff chunks */ |
483 | if (coeff <= 0) | 644 | if (coeff <= 0) |
484 | break; | 645 | break; |
485 | num_nodes += split_nodes_equally(nodes, &addr, max_addr, | 646 | num_nodes += split_nodes_equally(&addr, max_addr, |
486 | num_nodes, coeff); | 647 | num_nodes, coeff); |
487 | break; | 648 | break; |
488 | case ',': | 649 | case ',': |
@@ -490,8 +651,8 @@ done: | |||
490 | break; | 651 | break; |
491 | default: | 652 | default: |
492 | /* Give one final node */ | 653 | /* Give one final node */ |
493 | setup_node_range(num_nodes, nodes, &addr, | 654 | setup_node_range(num_nodes, &addr, max_addr - addr, |
494 | max_addr - addr, max_addr); | 655 | max_addr); |
495 | num_nodes++; | 656 | num_nodes++; |
496 | } | 657 | } |
497 | } | 658 | } |
@@ -505,14 +666,10 @@ out: | |||
505 | } | 666 | } |
506 | 667 | ||
507 | /* | 668 | /* |
508 | * We need to vacate all active ranges that may have been registered by | 669 | * We need to vacate all active ranges that may have been registered for |
509 | * SRAT and set acpi_numa to -1 so that srat_disabled() always returns | 670 | * the e820 memory map. |
510 | * true. NUMA emulation has succeeded so we will not scan ACPI nodes. | ||
511 | */ | 671 | */ |
512 | remove_all_active_ranges(); | 672 | remove_all_active_ranges(); |
513 | #ifdef CONFIG_ACPI_NUMA | ||
514 | acpi_numa = -1; | ||
515 | #endif | ||
516 | for_each_node_mask(i, node_possible_map) { | 673 | for_each_node_mask(i, node_possible_map) { |
517 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | 674 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, |
518 | nodes[i].end >> PAGE_SHIFT); | 675 | nodes[i].end >> PAGE_SHIFT); |
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | |||
533 | nodes_clear(node_online_map); | 690 | nodes_clear(node_online_map); |
534 | 691 | ||
535 | #ifdef CONFIG_NUMA_EMU | 692 | #ifdef CONFIG_NUMA_EMU |
536 | if (cmdline && !numa_emulation(start_pfn, last_pfn)) | 693 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) |
537 | return; | 694 | return; |
538 | nodes_clear(node_possible_map); | 695 | nodes_clear(node_possible_map); |
539 | nodes_clear(node_online_map); | 696 | nodes_clear(node_online_map); |