author		Linus Torvalds <torvalds@linux-foundation.org>	2010-02-28 13:39:36 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-02-28 13:39:36 -0500
commit		1eca9acbf9cda6437db7de1097c7a18014b1289d (patch)
tree		04c03ae847df74e45eac2eba5920761986a779c6 /arch
parent		0091945b4732469bb39bbb4556ce08a25d89d1c2 (diff)
parent		ca2107c9d6cf44fb915402d6f12b9d9ff3925cd7 (diff)
Merge branch 'x86-numa-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-numa-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
x86, numa: Remove configurable node size support for numa emulation
x86, numa: Add fixed node size option for numa emulation
x86, numa: Fix numa emulation calculation of big nodes
x86, acpi: Map hotadded cpu to correct node.
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/include/asm/mmzone_64.h	6
-rw-r--r--	arch/x86/include/asm/numa_64.h	5
-rw-r--r--	arch/x86/kernel/acpi/boot.c	21
-rw-r--r--	arch/x86/mm/numa_64.c	235
4 files changed, 127 insertions, 140 deletions
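
The user-visible change in this series is the numa=fake boot parameter. As the rewritten numa_emulation() below shows, a value carrying an 'M' or 'G' suffix is now parsed by memparse() as a fixed node size, a bare number still means a node count, and the old coefficient syntax (numa=fake=2*512, for example) is removed along with its hand-rolled parser. Illustrative forms (the values are made up):

    numa=fake=8      split system RAM into 8 interleaved fake nodes
    numa=fake=512M   carve 512 MiB fake nodes out of each physical node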
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_start_pfn +	\
 				 NODE_DATA(nid)->node_spanned_pages)
-
-#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE	(64 * 1024 * 1024)
-#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
-#endif
-
 #endif
 #endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
 extern void __cpuinit numa_add_cpu(int cpu);
 extern void __cpuinit numa_remove_cpu(int cpu);
+
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	((u64)64 << 20)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
+#endif /* CONFIG_NUMA_EMU */
 #else
 static inline void init_cpu_to_node(void)		{ }
 static inline void numa_set_node(int cpu, int node)	{ }
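
The emulation-only constants move out of mmzone_64.h, where all 64-bit mm code could see them, into numa_64.h next to the other CONFIG_NUMA_EMU declarations, and the size gains an explicit u64 type. Since 64 MiB is 2^26, FAKE_NODE_MIN_HASH_MASK clears the low 26 bits: ANDing a size with it rounds down to the 64 MiB granularity, and ANDing with its complement extracts the remainder. A minimal userspace sketch of that arithmetic (the stdio scaffolding is ours, not the kernel's):

#include <stdint.h>
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	((uint64_t)64 << 20)	/* 64 MiB */
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	uint64_t size = 200ULL << 20;	/* a 200 MiB candidate node size */

	/* Round down to the nearest 64 MiB multiple: 200 MiB -> 192 MiB. */
	printf("rounded:   %llu MiB\n",
	       (unsigned long long)((size & FAKE_NODE_MIN_HASH_MASK) >> 20));
	/* The low bits are the sub-64 MiB remainder: 200 MiB -> 8 MiB. */
	printf("remainder: %llu MiB\n",
	       (unsigned long long)((size & ~FAKE_NODE_MIN_HASH_MASK) >> 20));
	return 0;
}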
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index af1c5833ff23..f95703098f8d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include <asm/proto.h>
+# include <asm/numa_64.h>
 #endif				/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
  */
 #ifdef CONFIG_ACPI_HOTPLUG_CPU
 
+static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
+{
+#ifdef CONFIG_ACPI_NUMA
+	int nid;
+
+	nid = acpi_get_node(handle);
+	if (nid == -1 || !node_online(nid))
+		return;
+#ifdef CONFIG_X86_64
+	apicid_to_node[physid] = nid;
+	numa_set_node(cpu, nid);
+#else /* CONFIG_X86_32 */
+	apicid_2_node[physid] = nid;
+	cpu_to_node_map[cpu] = nid;
+#endif
+
+#endif
+}
+
 static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 {
 	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
@@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
 	}
 
 	cpu = cpumask_first(new_map);
+	acpi_map_cpu2node(handle, cpu, physid);
 
 	*pcpu = cpu;
 	retval = 0;
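
The new acpi_map_cpu2node() resolves the hot-added CPU's proximity domain via acpi_get_node() and refuses to touch the mapping when the lookup fails or the node is offline, so the CPU keeps its default placement. A self-contained sketch of that guard, with acpi_get_node() and node_online() stubbed out (every name and value here is illustrative, not the kernel implementation):

#include <stdio.h>

static int fake_nid = -1;	/* pretend the _PXM lookup failed */

static int acpi_get_node(void *handle) { (void)handle; return fake_nid; }
static int node_online(int nid) { return nid == 0; }	/* only node 0 up */

static int cpu_to_node_map[8];	/* zero-initialized: default is node 0 */

static void map_cpu2node(void *handle, int cpu)
{
	int nid = acpi_get_node(handle);

	if (nid == -1 || !node_online(nid))
		return;		/* keep the existing default mapping */
	cpu_to_node_map[cpu] = nid;
}

int main(void)
{
	map_cpu2node(NULL, 4);
	printf("cpu 4 -> node %d\n", cpu_to_node_map[4]);	/* still 0 */
	return 0;
}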
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..3307ea8bd43a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 	 * Calculate the number of big nodes that can be allocated as a result
 	 * of consolidating the remainder.
 	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 		FAKE_NODE_MIN_SIZE;
 
 	size &= FAKE_NODE_MIN_HASH_MASK;
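
The one-character fix above restores a multiplication that had been typo'd into a bitwise AND: the sub-64 MiB remainder of each node's share, summed over all nr_nodes, funds the extra FAKE_NODE_MIN_SIZE chunks handed to the "big" nodes, whereas the AND mixed a byte count with a node count and almost always produced zero big nodes. A worked example in plain C (the numbers are made up):

#include <stdint.h>
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	((uint64_t)64 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))

int main(void)
{
	uint64_t size = 240ULL << 20;	/* per-node share: 240 MiB */
	int nr_nodes = 4;
	/* Per-node remainder below the 64 MiB granularity: 48 MiB. */
	uint64_t rem = size & ~FAKE_NODE_MIN_HASH_MASK;
	/*
	 * 48 MiB * 4 nodes = 192 MiB of leftovers, i.e. 3 whole 64 MiB
	 * chunks, so the first 3 nodes each get one extra chunk.
	 */
	int big = (int)((rem * nr_nodes) / FAKE_NODE_MIN_SIZE);

	printf("big nodes: %d\n", big);	/* prints 3 */
	return 0;
}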
@@ -502,77 +502,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 }
 
 /*
- * Splits num_nodes nodes up equally starting at node_start. The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
  */
-static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
-				      int num_nodes)
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 {
-	unsigned int big;
-	u64 size;
-	int i;
-
-	if (num_nodes <= 0)
-		return -1;
-	if (num_nodes > MAX_NUMNODES)
-		num_nodes = MAX_NUMNODES;
-	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
-		num_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the leftovers.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		printk(KERN_ERR "Not enough memory for each node. "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	for (i = node_start; i < num_nodes + node_start; i++) {
-		u64 end = *addr + size;
+	u64 end = start + size;
 
-		if (i < big)
-			end += FAKE_NODE_MIN_SIZE;
-		/*
-		 * The final node can have the remaining system RAM. Other
-		 * nodes receive roughly the same amount of available pages.
-		 */
-		if (i == num_nodes + node_start - 1)
-			end = max_addr;
-		else
-			while (end - *addr - e820_hole_size(*addr, end) <
-			       size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > max_addr) {
-					end = max_addr;
-					break;
-				}
-			}
-		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
-			break;
+	while (end - start - e820_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
+			end = max_addr;
+			break;
+		}
 	}
-	return i - node_start + 1;
+	return end;
 }
 
 /*
- * Splits the remaining system RAM into chunks of size. The remaining memory is
- * always assigned to a final node and can be asymmetric. Returns the number of
- * nodes split.
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
  */
-static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
-				      u64 size)
+static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 {
-	int i = node_start;
-	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, addr, size, max_addr))
-		;
-	return i - node_start;
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int ret = 0;
+	int i;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small. This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+						FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 end;
+
+			end = find_end_of_node(physnodes[i].start,
+						physnodes[i].end, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Setup the fake node that will be allocated as bootmem
+			 * later. If setup_node_range() returns non-zero, there
+			 * is no more memory available on this physical node.
+			 */
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
 }
 
 /*
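
find_end_of_node() above replaces the loop that split_nodes_equally() used to open-code: it widens the candidate range in FAKE_NODE_MIN_SIZE steps until the range covers `size' bytes of non-reserved memory, clamping at max_addr. A self-contained sketch, with a single invented e820 hole standing in for the kernel's e820_hole_size() (the hole location and all values are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE ((uint64_t)64 << 20)

/* Stand-in for the kernel's e820_hole_size(): pretend the range
 * [3 GiB, 4 GiB) is reserved, like a typical PCI hole. */
static uint64_t e820_hole_size(uint64_t start, uint64_t end)
{
	const uint64_t hs = 3ULL << 30, he = 4ULL << 30;
	uint64_t lo = start > hs ? start : hs;
	uint64_t hi = end < he ? end : he;

	return hi > lo ? hi - lo : 0;
}

/* Same shape as the merged helper: grow `end' until the node holds at
 * least `size' bytes of non-hole memory or max_addr is reached. */
static uint64_t find_end_of_node(uint64_t start, uint64_t max_addr,
				 uint64_t size)
{
	uint64_t end = start + size;

	while (end - start - e820_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}

int main(void)
{
	/* A 512 MiB node starting at 2.75 GiB must stretch past the
	 * simulated 1 GiB hole to collect enough usable memory. */
	uint64_t end = find_end_of_node(2816ULL << 20, 8ULL << 30,
					512ULL << 20);

	printf("end = %llu MiB\n", (unsigned long long)(end >> 20));
	/* Prints "end = 4352 MiB": 512 MiB usable plus the 1024 MiB hole. */
	return 0;
}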
@@ -582,87 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
 static int __init numa_emulation(unsigned long start_pfn,
 			unsigned long last_pfn, int acpi, int k8)
 {
-	u64 size, addr = start_pfn << PAGE_SHIFT;
+	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 	int num_phys_nodes;
+	int num_nodes;
+	int i;
 
 	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
-	 * If the numa=fake command-line is just a single number N, split the
-	 * system RAM into N fake nodes.
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size. Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
 	 */
-	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-		long n = simple_strtol(cmdline, NULL, 0);
-
-		num_nodes = split_nodes_interleave(addr, max_addr,
-						num_phys_nodes, n);
-		if (num_nodes < 0)
-			return num_nodes;
-		goto out;
-	}
+	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+		u64 size;
 
-	/* Parse the command line. */
-	for (coeff_flag = 0; ; cmdline++) {
-		if (*cmdline && isdigit(*cmdline)) {
-			num = num * 10 + *cmdline - '0';
-			continue;
-		}
-		if (*cmdline == '*') {
-			if (num > 0)
-				coeff = num;
-			coeff_flag = 1;
-		}
-		if (!*cmdline || *cmdline == ',') {
-			if (!coeff_flag)
-				coeff = 1;
-			/*
-			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-			 * Command-line coefficients are in megabytes.
-			 */
-			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-			if (size)
-				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, &addr,
-						size, max_addr) < 0)
-						goto done;
-			if (!*cmdline)
-				break;
-			coeff_flag = 0;
-			coeff = -1;
-		}
-		num = 0;
-	}
-done:
-	if (!num_nodes)
-		return -1;
-	/* Fill remainder of system RAM, if appropriate. */
-	if (addr < max_addr) {
-		if (coeff_flag && coeff < 0) {
-			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(&addr, max_addr,
-					num_nodes, num);
-			goto out;
-		}
-		switch (*(cmdline - 1)) {
-		case '*':
-			/* Split remaining nodes into coeff chunks */
-			if (coeff <= 0)
-				break;
-			num_nodes += split_nodes_equally(&addr, max_addr,
-					num_nodes, coeff);
-			break;
-		case ',':
-			/* Do not allocate remaining system RAM */
-			break;
-		default:
-			/* Give one final node */
-			setup_node_range(num_nodes, &addr, max_addr - addr,
-					max_addr);
-			num_nodes++;
-		}
+		size = memparse(cmdline, &cmdline);
+		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(cmdline, NULL, 0);
+		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
 	}
-out:
+
+	if (num_nodes < 0)
+		return num_nodes;
 	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;
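
One subtlety in split_nodes_size_interleave() is the minimum size clamp: since at most MAX_NUMNODES fake nodes can exist, a too-small request is raised (with a pr_err noting the adjustment). A worked example of just that clamp, assuming MAX_NUMNODES is 512 (it is config-dependent) and 128 GiB of hole-free RAM:

#include <stdint.h>
#include <stdio.h>

#define FAKE_NODE_MIN_SIZE	((uint64_t)64 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1UL))
#define MAX_NUMNODES		512	/* assumed; set by NODES_SHIFT */

int main(void)
{
	uint64_t usable = 128ULL << 30;		/* 128 GiB, no e820 holes */
	uint64_t size = 128ULL << 20;		/* requested: numa=fake=128M */
	uint64_t min_size = usable / MAX_NUMNODES;	/* 256 MiB */

	if (min_size < FAKE_NODE_MIN_SIZE)
		min_size = FAKE_NODE_MIN_SIZE;
	/* Round up to the 64 MiB granularity if not already aligned. */
	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
			   FAKE_NODE_MIN_HASH_MASK;
	if (size < min_size)
		size = min_size;

	/* Prints "node size: 256 MiB": the 128M request was too small. */
	printf("node size: %llu MiB\n", (unsigned long long)(size >> 20));
	return 0;
}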