aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: David Rientjes <rientjes@google.com> 2010-02-15 16:43:30 -0500
committer: H. Peter Anvin <hpa@zytor.com> 2010-02-15 17:34:10 -0500
commit: 8df5bb34defd685fe86f60746bbf3d47d1c6f033 (patch)
tree: f2561781a9e493b297a3872b03ff6e4a23a5576c
parent: 68fd111e02b979876359c7b471a8bcbca0628b75 (diff)
x86, numa: Add fixed node size option for numa emulation
numa=fake=N specifies the number of fake nodes, N, to partition the system into and then allocates them by interleaving over physical nodes. This requires knowledge of the system capacity when attempting to allocate nodes of a certain size: either very large nodes to benchmark scalability of code that operates on individual nodes, or very small nodes to find bugs in the VM.

This patch introduces numa=fake=<size>[MG] so it is possible to specify the size of each node to allocate. When used, nodes of the size specified will be allocated and interleaved over the set of physical nodes.

FAKE_NODE_MIN_SIZE was also moved to the more-appropriate include/asm/numa_64.h.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1002151342510.26927@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
-rw-r--r-- Documentation/x86/x86_64/boot-options.txt | 4
-rw-r--r-- arch/x86/include/asm/mmzone_64.h | 6
-rw-r--r-- arch/x86/include/asm/numa_64.h | 5
-rw-r--r-- arch/x86/mm/numa_64.c | 117
4 files changed, 118 insertions, 14 deletions
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8bc7d3..01150c64aa73 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -166,6 +166,10 @@ NUMA
166 166
167 numa=noacpi Don't parse the SRAT table for NUMA setup 167 numa=noacpi Don't parse the SRAT table for NUMA setup
168 168
169 numa=fake=<size>[MG]
170 If given as a memory unit, fills all system RAM with nodes of
171 size interleaved over physical nodes.
172
169 numa=fake=CMDLINE 173 numa=fake=CMDLINE
170 If a number, fakes CMDLINE nodes and ignores NUMA setup of the 174 If a number, fakes CMDLINE nodes and ignores NUMA setup of the
171 actual machine. Otherwise, system memory is configured 175 actual machine. Otherwise, system memory is configured
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages) 41 NODE_DATA(nid)->node_spanned_pages)
42
43#ifdef CONFIG_NUMA_EMU
44#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
45#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
46#endif
47
48#endif 42#endif
49#endif /* _ASM_X86_MMZONE_64_H */ 43#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
39#else 44#else
40static inline void init_cpu_to_node(void) { } 45static inline void init_cpu_to_node(void) { }
41static inline void numa_set_node(int cpu, int node) { } 46static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2ecbe0ca0dfc..c47c78ba3aca 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -502,6 +502,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
502} 502}
503 503
504/* 504/*
505 * Returns the end address of a node so that there is at least `size' amount of
506 * non-reserved memory or `max_addr' is reached.
507 */
508static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
509{
510 u64 end = start + size;
511
512 while (end - start - e820_hole_size(start, end) < size) {
513 end += FAKE_NODE_MIN_SIZE;
514 if (end > max_addr) {
515 end = max_addr;
516 break;
517 }
518 }
519 return end;
520}
521
522/*
523 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
524 * `addr' to `max_addr'. The return value is the number of nodes allocated.
525 */
526static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
527{
528 nodemask_t physnode_mask = NODE_MASK_NONE;
529 u64 min_size;
530 int ret = 0;
531 int i;
532
533 if (!size)
534 return -1;
535 /*
536 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
537 * increased accordingly if the requested size is too small. This
538 * creates a uniform distribution of node sizes across the entire
539 * machine (but not necessarily over physical nodes).
540 */
541 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
542 MAX_NUMNODES;
543 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
544 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
545 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
546 FAKE_NODE_MIN_HASH_MASK;
547 if (size < min_size) {
548 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
549 size >> 20, min_size >> 20);
550 size = min_size;
551 }
552 size &= FAKE_NODE_MIN_HASH_MASK;
553
554 for (i = 0; i < MAX_NUMNODES; i++)
555 if (physnodes[i].start != physnodes[i].end)
556 node_set(i, physnode_mask);
557 /*
558 * Fill physical nodes with fake nodes of size until there is no memory
559 * left on any of them.
560 */
561 while (nodes_weight(physnode_mask)) {
562 for_each_node_mask(i, physnode_mask) {
563 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
564 u64 end;
565
566 end = find_end_of_node(physnodes[i].start,
567 physnodes[i].end, size);
568 /*
569 * If there won't be at least FAKE_NODE_MIN_SIZE of
570 * non-reserved memory in ZONE_DMA32 for the next node,
571 * this one must extend to the boundary.
572 */
573 if (end < dma32_end && dma32_end - end -
574 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
575 end = dma32_end;
576
577 /*
578 * If there won't be enough non-reserved memory for the
579 * next node, this one must extend to the end of the
580 * physical node.
581 */
582 if (physnodes[i].end - end -
583 e820_hole_size(end, physnodes[i].end) < size)
584 end = physnodes[i].end;
585
586 /*
587 * Setup the fake node that will be allocated as bootmem
588 * later. If setup_node_range() returns non-zero, there
589 * is no more memory available on this physical node.
590 */
591 if (setup_node_range(ret++, &physnodes[i].start,
592 end - physnodes[i].start,
593 physnodes[i].end) < 0)
594 node_clear(i, physnode_mask);
595 }
596 }
597 return ret;
598}
599
600/*
505 * Splits num_nodes nodes up equally starting at node_start. The return value 601 * Splits num_nodes nodes up equally starting at node_start. The return value
506 * is the number of nodes split up and addr is adjusted to be at the end of the 602 * is the number of nodes split up and addr is adjusted to be at the end of the
507 * last node allocated. 603 * last node allocated.
@@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
546 if (i == num_nodes + node_start - 1) 642 if (i == num_nodes + node_start - 1)
547 end = max_addr; 643 end = max_addr;
548 else 644 else
549 while (end - *addr - e820_hole_size(*addr, end) < 645 end = find_end_of_node(*addr, max_addr, size);
550 size) {
551 end += FAKE_NODE_MIN_SIZE;
552 if (end > max_addr) {
553 end = max_addr;
554 break;
555 }
556 }
557 if (setup_node_range(i, addr, end - *addr, max_addr) < 0) 646 if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
558 break; 647 break;
559 } 648 }
@@ -589,6 +678,18 @@ static int __init numa_emulation(unsigned long start_pfn,
589 678
590 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); 679 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
591 /* 680 /*
681 * If the numa=fake command-line contains a 'M' or 'G', it represents
682 * the fixed node size.
683 */
684 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
685 size = memparse(cmdline, &cmdline);
686 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
687 if (num_nodes < 0)
688 return num_nodes;
689 goto out;
690 }
691
692 /*
592 * If the numa=fake command-line is just a single number N, split the 693 * If the numa=fake command-line is just a single number N, split the
593 * system RAM into N fake nodes. 694 * system RAM into N fake nodes.
594 */ 695 */