-rw-r--r--	Documentation/x86/x86_64/boot-options.txt	 20
-rw-r--r--	arch/x86/include/asm/mmzone_64.h	  6
-rw-r--r--	arch/x86/include/asm/numa_64.h	  5
-rw-r--r--	arch/x86/kernel/acpi/boot.c	 21
-rw-r--r--	arch/x86/mm/numa_64.c	235
5 files changed, 134 insertions(+), 153 deletions(-)
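
With this patch, numa=fake accepts only the two forms documented in the
boot-options.txt hunk below. As a brief illustration (assuming a kernel built
with CONFIG_NUMA_EMU):

    numa=fake=512M	fills all system RAM with 512M fake nodes,
			interleaved over the underlying physical nodes
    numa=fake=8		splits all system RAM into 8 fake nodes,
			interleaved over the underlying physical nodes

The old coefficient syntax (e.g. numa=fake=2*512,1024,4*256,*128) is no longer
parsed; split_nodes_equally() and split_nodes_by_size(), which implemented it,
are removed in favor of split_nodes_size_interleave().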
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 29a6ff8bc7d3..7fbbaf85f5b7 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -166,19 +166,13 @@ NUMA
 
   numa=noacpi   Don't parse the SRAT table for NUMA setup
 
-  numa=fake=CMDLINE
-		If a number, fakes CMDLINE nodes and ignores NUMA setup of the
-		actual machine. Otherwise, system memory is configured
-		depending on the sizes and coefficients listed. For example:
-			numa=fake=2*512,1024,4*256,*128
-		gives two 512M nodes, a 1024M node, four 256M nodes, and the
-		rest split into 128M chunks. If the last character of CMDLINE
-		is a *, the remaining memory is divided up equally among its
-		coefficient:
-			numa=fake=2*512,2*
-		gives two 512M nodes and the rest split into two nodes.
-		Otherwise, the remaining system RAM is allocated to an
-		additional node.
+  numa=fake=<size>[MG]
+		If given as a memory unit, fills all system RAM with nodes of
+		size interleaved over physical nodes.
+
+  numa=fake=<N>
+		If given as an integer, fills all system RAM with N fake nodes
+		interleaved over physical nodes.
 
 ACPI
 
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_start_pfn +	\
 				 NODE_DATA(nid)->node_spanned_pages)
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE	(64 * 1024 * 1024)
-#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
-#endif
-
 #endif
 #endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	((u64)64 << 20)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
+#endif /* CONFIG_NUMA_EMU */
 #else
 static inline void init_cpu_to_node(void)		{ }
 static inline void numa_set_node(int cpu, int node)	{ }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index af1c5833ff23..f95703098f8d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
+# include <asm/numa_64.h>
 #endif				/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+	int nid;
+
+	nid = acpi_get_node(handle);
+	if (nid == -1 || !node_online(nid))
+		return;
+#ifdef CONFIG_X86_64
+	apicid_to_node[physid] = nid;
+	numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+	apicid_2_node[physid] = nid;
+	cpu_to_node_map[cpu] = nid;
+#endif
+
+#endif
+}
+
 static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	}
 
 	cpu = cpumask_first(new_map);
+	acpi_map_cpu2node(handle, cpu, physid);
 
 	*pcpu = cpu;
 	retval = 0;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..3307ea8bd43a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 	 * Calculate the number of big nodes that can be allocated as a result
 	 * of consolidating the remainder.
 	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 		FAKE_NODE_MIN_SIZE;
 
 	size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +502,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 }
 
 /*
- * Splits num_nodes nodes up equally starting at node_start. The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
  */
-static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
-				      int num_nodes)
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 {
-	unsigned int big;
-	u64 size;
-	int i;
-
-	if (num_nodes <= 0)
-		return -1;
-	if (num_nodes > MAX_NUMNODES)
-		num_nodes = MAX_NUMNODES;
-	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
-		num_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the leftovers.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		printk(KERN_ERR "Not enough memory for each node. "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	for (i = node_start; i < num_nodes + node_start; i++) {
-		u64 end = *addr + size;
+	u64 end = start + size;
 
-		if (i < big)
-			end += FAKE_NODE_MIN_SIZE;
-		/*
-		 * The final node can have the remaining system RAM. Other
-		 * nodes receive roughly the same amount of available pages.
-		 */
-		if (i == num_nodes + node_start - 1)
+	while (end - start - e820_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
 			end = max_addr;
-		else
-			while (end - *addr - e820_hole_size(*addr, end) <
-			       size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > max_addr) {
-					end = max_addr;
-					break;
-				}
-			}
-		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
+		}
 	}
-	return i - node_start + 1;
+	return end;
 }
 
 /*
- * Splits the remaining system RAM into chunks of size. The remaining memory is
- * always assigned to a final node and can be asymmetric. Returns the number of
- * nodes split.
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
  */
-static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
-				      u64 size)
+static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 {
-	int i = node_start;
-	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, addr, size, max_addr))
-		;
-	return i - node_start;
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int ret = 0;
+	int i;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small. This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 end;
+
+			end = find_end_of_node(physnodes[i].start,
+						physnodes[i].end, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Setup the fake node that will be allocated as bootmem
+			 * later. If setup_node_range() returns non-zero, there
+			 * is no more memory available on this physical node.
+			 */
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
 }
 
 /*
@@ -582,87 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
 static int __init numa_emulation(unsigned long start_pfn,
 				 unsigned long last_pfn, int acpi, int k8)
 {
-	u64 size, addr = start_pfn << PAGE_SHIFT;
+	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 	int num_phys_nodes;
+	int num_nodes;
+	int i;
 
 	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
-	 * If the numa=fake command-line is just a single number N, split the
-	 * system RAM into N fake nodes.
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size. Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
 	 */
-	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-		long n = simple_strtol(cmdline, NULL, 0);
-
-		num_nodes = split_nodes_interleave(addr, max_addr,
-						   num_phys_nodes, n);
-		if (num_nodes < 0)
-			return num_nodes;
-		goto out;
-	}
+	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+		u64 size;
 
-	/* Parse the command line. */
-	for (coeff_flag = 0; ; cmdline++) {
-		if (*cmdline && isdigit(*cmdline)) {
-			num = num * 10 + *cmdline - '0';
-			continue;
-		}
-		if (*cmdline == '*') {
-			if (num > 0)
-				coeff = num;
-			coeff_flag = 1;
-		}
-		if (!*cmdline || *cmdline == ',') {
-			if (!coeff_flag)
-				coeff = 1;
-			/*
-			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-			 * Command-line coefficients are in megabytes.
-			 */
-			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-			if (size)
-				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, &addr,
-						size, max_addr) < 0)
-						goto done;
-			if (!*cmdline)
-				break;
-			coeff_flag = 0;
-			coeff = -1;
-		}
-		num = 0;
-	}
-done:
-	if (!num_nodes)
-		return -1;
-	/* Fill remainder of system RAM, if appropriate. */
-	if (addr < max_addr) {
-		if (coeff_flag && coeff < 0) {
-			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(&addr, max_addr,
-							 num_nodes, num);
-			goto out;
-		}
-		switch (*(cmdline - 1)) {
-		case '*':
-			/* Split remaining nodes into coeff chunks */
-			if (coeff <= 0)
-				break;
-			num_nodes += split_nodes_equally(&addr, max_addr,
-							 num_nodes, coeff);
-			break;
-		case ',':
-			/* Do not allocate remaining system RAM */
-			break;
-		default:
-			/* Give one final node */
-			setup_node_range(num_nodes, &addr, max_addr - addr,
-					 max_addr);
-			num_nodes++;
-		}
+		size = memparse(cmdline, &cmdline);
+		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(cmdline, NULL, 0);
+		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
 	}
-out:
+
+	if (num_nodes < 0)
+		return num_nodes;
 	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;