diff options
-rw-r--r-- | Documentation/x86/x86_64/boot-options.txt | 4 | ||||
-rw-r--r-- | arch/x86/include/asm/mmzone_64.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/numa_64.h | 5 | ||||
-rw-r--r-- | arch/x86/mm/numa_64.c | 117 |
4 files changed, 118 insertions, 14 deletions
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 29a6ff8bc7d3..01150c64aa73 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
@@ -166,6 +166,10 @@ NUMA | |||
166 | 166 | ||
167 | numa=noacpi Don't parse the SRAT table for NUMA setup | 167 | numa=noacpi Don't parse the SRAT table for NUMA setup |
168 | 168 | ||
169 | numa=fake=<size>[MG] | ||
170 | If given as a memory size (with an M or G suffix), fills all system RAM with | ||
171 | fake nodes of that size, interleaved over the physical nodes. | ||
172 | |||
169 | numa=fake=CMDLINE | 173 | numa=fake=CMDLINE |
170 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the | 174 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the |
171 | actual machine. Otherwise, system memory is configured | 175 | actual machine. Otherwise, system memory is configured |
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h | |||
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) | |||
39 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 39 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
40 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ | 40 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ |
41 | NODE_DATA(nid)->node_spanned_pages) | 41 | NODE_DATA(nid)->node_spanned_pages) |
42 | |||
43 | #ifdef CONFIG_NUMA_EMU | ||
44 | #define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) | ||
45 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) | ||
46 | #endif | ||
47 | |||
48 | #endif | 42 | #endif |
49 | #endif /* _ASM_X86_MMZONE_64_H */ | 43 | #endif /* _ASM_X86_MMZONE_64_H */ |
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h | |||
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); | |||
36 | extern void __cpuinit numa_clear_node(int cpu); | 36 | extern void __cpuinit numa_clear_node(int cpu); |
37 | extern void __cpuinit numa_add_cpu(int cpu); | 37 | extern void __cpuinit numa_add_cpu(int cpu); |
38 | extern void __cpuinit numa_remove_cpu(int cpu); | 38 | extern void __cpuinit numa_remove_cpu(int cpu); |
39 | |||
40 | #ifdef CONFIG_NUMA_EMU | ||
41 | #define FAKE_NODE_MIN_SIZE ((u64)64 << 20) | ||
42 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) | ||
43 | #endif /* CONFIG_NUMA_EMU */ | ||
39 | #else | 44 | #else |
40 | static inline void init_cpu_to_node(void) { } | 45 | static inline void init_cpu_to_node(void) { } |
41 | static inline void numa_set_node(int cpu, int node) { } | 46 | static inline void numa_set_node(int cpu, int node) { } |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ecbe0ca0dfc..c47c78ba3aca 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -502,6 +502,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
502 | } | 502 | } |
503 | 503 | ||
504 | /* | 504 | /* |
505 | * Returns the end address of a node so that there is at least `size' amount of | ||
506 | * non-reserved memory or `max_addr' is reached. | ||
507 | */ | ||
508 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
509 | { | ||
510 | u64 end = start + size; | ||
511 | |||
512 | while (end - start - e820_hole_size(start, end) < size) { | ||
513 | end += FAKE_NODE_MIN_SIZE; | ||
514 | if (end > max_addr) { | ||
515 | end = max_addr; | ||
516 | break; | ||
517 | } | ||
518 | } | ||
519 | return end; | ||
520 | } | ||
521 | |||
522 | /* | ||
523 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
524 | * `addr' to `max_addr'. The return value is the number of nodes allocated, or -1 if `size' is zero. | ||
525 | */ | ||
526 | static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | ||
527 | { | ||
528 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
529 | u64 min_size; | ||
530 | int ret = 0; | ||
531 | int i; | ||
532 | |||
533 | if (!size) | ||
534 | return -1; | ||
535 | /* | ||
536 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
537 | * increased accordingly if the requested size is too small. This | ||
538 | * creates a uniform distribution of node sizes across the entire | ||
539 | * machine (but not necessarily over physical nodes). | ||
540 | */ | ||
541 | min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / | ||
542 | MAX_NUMNODES; | ||
543 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
544 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
545 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
546 | FAKE_NODE_MIN_HASH_MASK; | ||
547 | if (size < min_size) { | ||
548 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
549 | size >> 20, min_size >> 20); | ||
550 | size = min_size; | ||
551 | } | ||
552 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
553 | |||
554 | for (i = 0; i < MAX_NUMNODES; i++) | ||
555 | if (physnodes[i].start != physnodes[i].end) | ||
556 | node_set(i, physnode_mask); | ||
557 | /* | ||
558 | * Fill physical nodes with fake nodes of size until there is no memory | ||
559 | * left on any of them. | ||
560 | */ | ||
561 | while (nodes_weight(physnode_mask)) { | ||
562 | for_each_node_mask(i, physnode_mask) { | ||
563 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
564 | u64 end; | ||
565 | |||
566 | end = find_end_of_node(physnodes[i].start, | ||
567 | physnodes[i].end, size); | ||
568 | /* | ||
569 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
570 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
571 | * this one must extend to the boundary. | ||
572 | */ | ||
573 | if (end < dma32_end && dma32_end - end - | ||
574 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
575 | end = dma32_end; | ||
576 | |||
577 | /* | ||
578 | * If there won't be enough non-reserved memory for the | ||
579 | * next node, this one must extend to the end of the | ||
580 | * physical node. | ||
581 | */ | ||
582 | if (physnodes[i].end - end - | ||
583 | e820_hole_size(end, physnodes[i].end) < size) | ||
584 | end = physnodes[i].end; | ||
585 | |||
586 | /* | ||
587 | * Setup the fake node that will be allocated as bootmem | ||
588 | * later. If setup_node_range() returns non-zero, there | ||
589 | * is no more memory available on this physical node. | ||
590 | */ | ||
591 | if (setup_node_range(ret++, &physnodes[i].start, | ||
592 | end - physnodes[i].start, | ||
593 | physnodes[i].end) < 0) | ||
594 | node_clear(i, physnode_mask); | ||
595 | } | ||
596 | } | ||
597 | return ret; | ||
598 | } | ||
599 | |||
600 | /* | ||
505 | * Splits num_nodes nodes up equally starting at node_start. The return value | 601 | * Splits num_nodes nodes up equally starting at node_start. The return value |
506 | * is the number of nodes split up and addr is adjusted to be at the end of the | 602 | * is the number of nodes split up and addr is adjusted to be at the end of the |
507 | * last node allocated. | 603 | * last node allocated. |
@@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, | |||
546 | if (i == num_nodes + node_start - 1) | 642 | if (i == num_nodes + node_start - 1) |
547 | end = max_addr; | 643 | end = max_addr; |
548 | else | 644 | else |
549 | while (end - *addr - e820_hole_size(*addr, end) < | 645 | end = find_end_of_node(*addr, max_addr, size); |
550 | size) { | ||
551 | end += FAKE_NODE_MIN_SIZE; | ||
552 | if (end > max_addr) { | ||
553 | end = max_addr; | ||
554 | break; | ||
555 | } | ||
556 | } | ||
557 | if (setup_node_range(i, addr, end - *addr, max_addr) < 0) | 646 | if (setup_node_range(i, addr, end - *addr, max_addr) < 0) |
558 | break; | 647 | break; |
559 | } | 648 | } |
@@ -589,6 +678,18 @@ static int __init numa_emulation(unsigned long start_pfn, | |||
589 | 678 | ||
590 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); | 679 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); |
591 | /* | 680 | /* |
681 | * If the numa=fake command-line contains a 'M' or 'G', it represents | ||
682 | * the fixed node size. | ||
683 | */ | ||
684 | if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { | ||
685 | size = memparse(cmdline, &cmdline); | ||
686 | num_nodes = split_nodes_size_interleave(addr, max_addr, size); | ||
687 | if (num_nodes < 0) | ||
688 | return num_nodes; | ||
689 | goto out; | ||
690 | } | ||
691 | |||
692 | /* | ||
592 | * If the numa=fake command-line is just a single number N, split the | 693 | * If the numa=fake command-line is just a single number N, split the |
593 | * system RAM into N fake nodes. | 694 | * system RAM into N fake nodes. |
594 | */ | 695 | */ |