path: root/arch/x86/mm/numa_64.c
author     David Rientjes <rientjes@google.com>   2009-09-25 18:20:09 -0400
committer  Ingo Molnar <mingo@elte.hu>            2009-10-12 16:56:46 -0400
commit     adc1938994f7f1112d335d998b5218b0aa680ad6 (patch)
tree       66b15981e346145fba39e3560ef8b192e2c7e10d /arch/x86/mm/numa_64.c
parent     8716273caef7f55f39fe4fc6c69c5f9f197f41f1 (diff)
x86: Interleave emulated nodes over physical nodes
Add interleaved NUMA emulation support

This patch interleaves emulated nodes over the system's physical nodes.
This is required for interleave optimizations since mempolicies, for
example, operate by iterating over a nodemask and act without knowledge
of node distances. It can also be used for testing memory latencies and
NUMA bugs in the kernel.

There are a couple of ways to do this:

- divide the number of emulated nodes by the number of physical nodes
  and allocate the result on each physical node, or

- allocate each successive emulated node on a different physical node
  until all memory is exhausted.

The disadvantage of the first option is that, depending on the asymmetry
in node capacities of each physical node, emulated nodes may
substantially differ in size on a particular physical node compared to
another.

The disadvantage of the second option is that, also depending on the
asymmetry in node capacities of each physical node, more emulated nodes
may be allocated on a single physical node than on another.

This patch implements the second option; we accept the possibility of
slightly more emulated nodes on a particular physical node compared to
another rather than accept node size asymmetry.

[ Note that the "node capacity" of a physical node is not only a
  function of its addressable range, but is also affected by subtracting
  out the amount of reserved memory over that range. NUMA emulation only
  deals with available, non-reserved memory quantities. ]

We ensure there is at least a minimal amount of available memory
allocated to each node. We also make sure that at least this amount of
available memory is available in ZONE_DMA32 for any node that includes
both ZONE_DMA32 and ZONE_NORMAL.

This patch also cleans the emulation code up by no longer passing the
statically allocated struct bootnode array among the various functions.
This init.data array is not allocated on the stack since it may be very
large, and thus it may be accessed at file scope.

The WARN_ON() for nodes_cover_memory() when faking proximity domains is
removed since it relies on successive nodes always having greater start
addresses than previous nodes; with interleaving this is no longer
always true.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Ankita Garg <ankita@in.ibm.com>
Cc: Len Brown <len.brown@intel.com>
LKML-Reference: <alpine.DEB.1.00.0909251519150.14754@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
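For illustration, a minimal, stand-alone C sketch of the round-robin interleaving strategy described above. This is not the kernel implementation (which operates on e820-derived bootnode ranges); the capacities and names (phys_free, chunk, nr_emu) are made up for the example.

/*
 * Hypothetical sketch: hand out emulated nodes round-robin across
 * physical nodes, each taking roughly total / nr_emu, until a physical
 * node runs out of memory.
 */
#include <stdio.h>

#define MAX_PHYS 4

int main(void)
{
        /* assumed per-physical-node free memory, in MB */
        unsigned long phys_free[MAX_PHYS] = { 4096, 4096, 2048, 1024 };
        unsigned long total = 0, chunk;
        int nr_emu = 8, emu = 0, i;

        for (i = 0; i < MAX_PHYS; i++)
                total += phys_free[i];
        chunk = total / nr_emu;         /* per-emulated-node size */

        /* round-robin over physical nodes until all memory is handed out */
        while (emu < nr_emu) {
                int progress = 0;

                for (i = 0; i < MAX_PHYS && emu < nr_emu; i++) {
                        unsigned long take;

                        if (!phys_free[i])
                                continue;
                        take = phys_free[i] < chunk ? phys_free[i] : chunk;
                        phys_free[i] -= take;
                        printf("emulated node %d: %lu MB on physical node %d\n",
                               emu++, take, i);
                        progress = 1;
                }
                if (!progress)
                        break;          /* nothing left anywhere */
        }
        return 0;
}

The real split_nodes_interleave() below additionally rounds node sizes to FAKE_NODE_MIN_SIZE, skips reserved e820 holes, and honours the ZONE_DMA32 constraint described in the commit message.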
Diffstat (limited to 'arch/x86/mm/numa_64.c')
-rw-r--r--  arch/x86/mm/numa_64.c  211
1 file changed, 184 insertions(+), 27 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index d1a3d94efc8e..086f98a66d80 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -306,8 +306,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+                                  int acpi, int k8)
+{
+        int nr_nodes = 0;
+        int ret = 0;
+        int i;
+
+#ifdef CONFIG_ACPI_NUMA
+        if (acpi)
+                nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+        if (k8)
+                nr_nodes = k8_get_nodes(physnodes);
+#endif
+        /*
+         * Basic sanity checking on the physical node map: there may be errors
+         * if the SRAT or K8 incorrectly reported the topology or the mem=
+         * kernel parameter is used.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                if (physnodes[i].start > end) {
+                        physnodes[i].end = physnodes[i].start;
+                        continue;
+                }
+                if (physnodes[i].end < start) {
+                        physnodes[i].start = physnodes[i].end;
+                        continue;
+                }
+                if (physnodes[i].start < start)
+                        physnodes[i].start = start;
+                if (physnodes[i].end > end)
+                        physnodes[i].end = end;
+        }
+
+        /*
+         * Remove all nodes that have no memory or were truncated because of the
+         * limited address range.
+         */
+        for (i = 0; i < nr_nodes; i++) {
+                if (physnodes[i].start == physnodes[i].end)
+                        continue;
+                physnodes[ret].start = physnodes[i].start;
+                physnodes[ret].end = physnodes[i].end;
+                ret++;
+        }
+
+        /*
+         * If no physical topology was detected, a single node is faked to cover
+         * the entire address space.
+         */
+        if (!ret) {
+                physnodes[ret].start = start;
+                physnodes[ret].end = end;
+                ret = 1;
+        }
+        return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +378,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-                                u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
         int ret = 0;
-
         nodes[nid].start = *addr;
         *addr += size;
         if (*addr >= max_addr) {
@@ -335,12 +396,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+                                        int nr_phys_nodes, int nr_nodes)
+{
+        nodemask_t physnode_mask = NODE_MASK_NONE;
+        u64 size;
+        int big;
+        int ret = 0;
+        int i;
+
+        if (nr_nodes <= 0)
+                return -1;
+        if (nr_nodes > MAX_NUMNODES) {
+                pr_info("numa=fake=%d too large, reducing to %d\n",
+                        nr_nodes, MAX_NUMNODES);
+                nr_nodes = MAX_NUMNODES;
+        }
+
+        size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+        /*
+         * Calculate the number of big nodes that can be allocated as a result
+         * of consolidating the remainder.
+         */
+        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+                FAKE_NODE_MIN_SIZE;
+
+        size &= FAKE_NODE_MIN_HASH_MASK;
+        if (!size) {
+                pr_err("Not enough memory for each node.  "
+                        "NUMA emulation disabled.\n");
+                return -1;
+        }
+
+        for (i = 0; i < nr_phys_nodes; i++)
+                if (physnodes[i].start != physnodes[i].end)
+                        node_set(i, physnode_mask);
+
+        /*
+         * Continue to fill physical nodes with fake nodes until there is no
+         * memory left on any of them.
+         */
+        while (nodes_weight(physnode_mask)) {
+                for_each_node_mask(i, physnode_mask) {
+                        u64 end = physnodes[i].start + size;
+                        u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+                        if (ret < big)
+                                end += FAKE_NODE_MIN_SIZE;
+
+                        /*
+                         * Continue to add memory to this fake node if its
+                         * non-reserved memory is less than the per-node size.
+                         */
+                        while (end - physnodes[i].start -
+                                e820_hole_size(physnodes[i].start, end) < size) {
+                                end += FAKE_NODE_MIN_SIZE;
+                                if (end > physnodes[i].end) {
+                                        end = physnodes[i].end;
+                                        break;
+                                }
+                        }
+
+                        /*
+                         * If there won't be at least FAKE_NODE_MIN_SIZE of
+                         * non-reserved memory in ZONE_DMA32 for the next node,
+                         * this one must extend to the boundary.
+                         */
+                        if (end < dma32_end && dma32_end - end -
+                            e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+                                end = dma32_end;
+
+                        /*
+                         * If there won't be enough non-reserved memory for the
+                         * next node, this one must extend to the end of the
+                         * physical node.
+                         */
+                        if (physnodes[i].end - end -
+                            e820_hole_size(end, physnodes[i].end) < size)
+                                end = physnodes[i].end;
+
+                        /*
+                         * Avoid allocating more nodes than requested, which can
+                         * happen as a result of rounding down each node's size
+                         * to FAKE_NODE_MIN_SIZE.
+                         */
+                        if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+                                end = physnodes[i].end;
+
+                        if (setup_node_range(ret++, &physnodes[i].start,
+                                                end - physnodes[i].start,
+                                                physnodes[i].end) < 0)
+                                node_clear(i, physnode_mask);
+                }
+        }
+        return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start.  The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-                                u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
                                 int num_nodes)
 {
         unsigned int big;
@@ -388,7 +548,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                         break;
                                 }
                         }
-                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+                if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
                         break;
         }
         return i - node_start + 1;
@@ -399,12 +559,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
  * always assigned to a final node and can be asymmetric.  Returns the number of
  * nodes split.
  */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-                                u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+                                u64 size)
 {
         int i = node_start;
         size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-        while (!setup_node_range(i++, nodes, addr, size, max_addr))
+        while (!setup_node_range(i++, addr, size, max_addr))
                 ;
         return i - node_start;
 }
@@ -413,15 +573,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
  * Sets up the system RAM area from start_pfn to last_pfn according to the
  * numa=fake command-line option.
  */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+                                unsigned long last_pfn, int acpi, int k8)
 {
         u64 size, addr = start_pfn << PAGE_SHIFT;
         u64 max_addr = last_pfn << PAGE_SHIFT;
         int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+        int num_phys_nodes;
 
-        memset(&nodes, 0, sizeof(nodes));
+        num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
         /*
          * If the numa=fake command-line is just a single number N, split the
          * system RAM into N fake nodes.
@@ -429,7 +589,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
         if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                 long n = simple_strtol(cmdline, NULL, 0);
 
-                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+                num_nodes = split_nodes_interleave(addr, max_addr,
+                                                num_phys_nodes, n);
                 if (num_nodes < 0)
                         return num_nodes;
                 goto out;
@@ -456,8 +617,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
                 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                 if (size)
                         for (i = 0; i < coeff; i++, num_nodes++)
-                                if (setup_node_range(num_nodes, nodes,
-                                                &addr, size, max_addr) < 0)
+                                if (setup_node_range(num_nodes, &addr,
+                                                size, max_addr) < 0)
                                         goto done;
                 if (!*cmdline)
                         break;
@@ -473,7 +634,7 @@ done:
         if (addr < max_addr) {
                 if (coeff_flag && coeff < 0) {
                         /* Split remaining nodes into num-sized chunks */
-                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_by_size(&addr, max_addr,
                                                 num_nodes, num);
                         goto out;
                 }
@@ -482,7 +643,7 @@ done:
                         /* Split remaining nodes into coeff chunks */
                         if (coeff <= 0)
                                 break;
-                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+                        num_nodes += split_nodes_equally(&addr, max_addr,
                                                 num_nodes, coeff);
                         break;
                 case ',':
@@ -490,8 +651,8 @@ done:
                         break;
                 default:
                         /* Give one final node */
-                        setup_node_range(num_nodes, nodes, &addr,
-                                        max_addr - addr, max_addr);
+                        setup_node_range(num_nodes, &addr, max_addr - addr,
+                                        max_addr);
                         num_nodes++;
                 }
         }
@@ -505,14 +666,10 @@ out:
         }
 
         /*
-         * We need to vacate all active ranges that may have been registered by
-         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-         * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+         * We need to vacate all active ranges that may have been registered for
+         * the e820 memory map.
          */
         remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-        acpi_numa = -1;
-#endif
         for_each_node_mask(i, node_possible_map) {
                 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                         nodes[i].end >> PAGE_SHIFT);
@@ -533,7 +690,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
         nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-        if (cmdline && !numa_emulation(start_pfn, last_pfn))
+        if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
                 return;
         nodes_clear(node_possible_map);
         nodes_clear(node_online_map);