diff options
| -rw-r--r-- | Documentation/x86/x86_64/boot-options.txt | 4 | ||||
| -rw-r--r-- | arch/x86/include/asm/mmzone_64.h | 6 | ||||
| -rw-r--r-- | arch/x86/include/asm/numa_64.h | 5 | ||||
| -rw-r--r-- | arch/x86/mm/numa_64.c | 117 |
4 files changed, 118 insertions, 14 deletions
diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 29a6ff8bc7d3..01150c64aa73 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt | |||
| @@ -166,6 +166,10 @@ NUMA | |||
| 166 | 166 | ||
| 167 | numa=noacpi Don't parse the SRAT table for NUMA setup | 167 | numa=noacpi Don't parse the SRAT table for NUMA setup |
| 168 | 168 | ||
| 169 | numa=fake=<size>[MG] | ||
| 170 | If given as a memory unit, fills all system RAM with nodes of | ||
| 171 | the given size, interleaved over the physical nodes. | ||
| 172 | |||
| 169 | numa=fake=CMDLINE | 173 | numa=fake=CMDLINE |
| 170 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the | 174 | If a number, fakes CMDLINE nodes and ignores NUMA setup of the |
| 171 | actual machine. Otherwise, system memory is configured | 175 | actual machine. Otherwise, system memory is configured |
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h | |||
| @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) | |||
| 39 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 39 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
| 40 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ | 40 | #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ |
| 41 | NODE_DATA(nid)->node_spanned_pages) | 41 | NODE_DATA(nid)->node_spanned_pages) |
| 42 | |||
| 43 | #ifdef CONFIG_NUMA_EMU | ||
| 44 | #define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) | ||
| 45 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) | ||
| 46 | #endif | ||
| 47 | |||
| 48 | #endif | 42 | #endif |
| 49 | #endif /* _ASM_X86_MMZONE_64_H */ | 43 | #endif /* _ASM_X86_MMZONE_64_H */ |
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h | |||
| @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); | |||
| 36 | extern void __cpuinit numa_clear_node(int cpu); | 36 | extern void __cpuinit numa_clear_node(int cpu); |
| 37 | extern void __cpuinit numa_add_cpu(int cpu); | 37 | extern void __cpuinit numa_add_cpu(int cpu); |
| 38 | extern void __cpuinit numa_remove_cpu(int cpu); | 38 | extern void __cpuinit numa_remove_cpu(int cpu); |
| 39 | |||
| 40 | #ifdef CONFIG_NUMA_EMU | ||
| 41 | #define FAKE_NODE_MIN_SIZE ((u64)64 << 20) | ||
| 42 | #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) | ||
| 43 | #endif /* CONFIG_NUMA_EMU */ | ||
| 39 | #else | 44 | #else |
| 40 | static inline void init_cpu_to_node(void) { } | 45 | static inline void init_cpu_to_node(void) { } |
| 41 | static inline void numa_set_node(int cpu, int node) { } | 46 | static inline void numa_set_node(int cpu, int node) { } |
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ecbe0ca0dfc..c47c78ba3aca 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
| @@ -502,6 +502,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, | |||
| 502 | } | 502 | } |
| 503 | 503 | ||
| 504 | /* | 504 | /* |
| 505 | * Returns the end address of a node starting at `start' so that it covers at least `size' amount of | ||
| 506 | * non-reserved memory (the span minus e820_hole_size() over it), or `max_addr' if that limit is reached first. | ||
| 507 | */ | ||
| 508 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
| 509 | { | ||
| 510 | u64 end = start + size; | ||
| 511 | |||
| 512 | while (end - start - e820_hole_size(start, end) < size) { | ||
| 513 | end += FAKE_NODE_MIN_SIZE; /* grow the candidate span in FAKE_NODE_MIN_SIZE steps */ | ||
| 514 | if (end > max_addr) { | ||
| 515 | end = max_addr; /* clamp to the upper limit and stop searching */ | ||
| 516 | break; | ||
| 517 | } | ||
| 518 | } | ||
| 519 | return end; | ||
| 520 | } | ||
| 521 | |||
| 522 | /* | ||
| 523 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
| 524 | * `addr' to `max_addr'. The return value is the number of nodes allocated, or -1 if `size' is zero. | ||
| 525 | */ | ||
| 526 | static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | ||
| 527 | { | ||
| 528 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
| 529 | u64 min_size; | ||
| 530 | int ret = 0; | ||
| 531 | int i; | ||
| 532 | |||
| 533 | if (!size) | ||
| 534 | return -1; | ||
| 535 | /* | ||
| 536 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
| 537 | * increased accordingly if the requested size is too small. This | ||
| 538 | * creates a uniform distribution of node sizes across the entire | ||
| 539 | * machine (but not necessarily over physical nodes). | ||
| 540 | */ | ||
| 541 | min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / | ||
| 542 | MAX_NUMNODES; | ||
| 543 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); /* never go below FAKE_NODE_MIN_SIZE */ | ||
| 544 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) /* not a multiple of FAKE_NODE_MIN_SIZE: round up */ | ||
| 545 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
| 546 | FAKE_NODE_MIN_HASH_MASK; | ||
| 547 | if (size < min_size) { | ||
| 548 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
| 549 | size >> 20, min_size >> 20); | ||
| 550 | size = min_size; | ||
| 551 | } | ||
| 552 | size &= FAKE_NODE_MIN_HASH_MASK; /* round the node size down to a multiple of FAKE_NODE_MIN_SIZE */ | ||
| 553 | |||
| 554 | for (i = 0; i < MAX_NUMNODES; i++) /* collect the physical nodes whose range is non-empty */ | ||
| 555 | if (physnodes[i].start != physnodes[i].end) | ||
| 556 | node_set(i, physnode_mask); | ||
| 557 | /* | ||
| 558 | * Fill physical nodes with fake nodes of `size', one fake node per | ||
| 559 | * physical node per pass, until there is no memory left on any of them. | ||
| 560 | */ | ||
| 561 | while (nodes_weight(physnode_mask)) { | ||
| 562 | for_each_node_mask(i, physnode_mask) { | ||
| 563 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
| 564 | u64 end; | ||
| 565 | |||
| 566 | end = find_end_of_node(physnodes[i].start, | ||
| 567 | physnodes[i].end, size); | ||
| 568 | /* | ||
| 569 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
| 570 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
| 571 | * this one must extend to the boundary. | ||
| 572 | */ | ||
| 573 | if (end < dma32_end && dma32_end - end - | ||
| 574 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
| 575 | end = dma32_end; | ||
| 576 | |||
| 577 | /* | ||
| 578 | * If there won't be enough non-reserved memory for the | ||
| 579 | * next node, this one must extend to the end of the | ||
| 580 | * physical node. | ||
| 581 | */ | ||
| 582 | if (physnodes[i].end - end - | ||
| 583 | e820_hole_size(end, physnodes[i].end) < size) | ||
| 584 | end = physnodes[i].end; | ||
| 585 | |||
| 586 | /* | ||
| 587 | * Setup the fake node that will be allocated as bootmem | ||
| 588 | * later. If setup_node_range() returns a negative value, this | ||
| 589 | * physical node is dropped from the mask (no more memory on it). | ||
| 590 | */ | ||
| 591 | if (setup_node_range(ret++, &physnodes[i].start, /* the fake node id is consumed either way; presumably advances .start - confirm in setup_node_range() */ | ||
| 592 | end - physnodes[i].start, | ||
| 593 | physnodes[i].end) < 0) | ||
| 594 | node_clear(i, physnode_mask); | ||
| 595 | } | ||
| 596 | } | ||
| 597 | return ret; | ||
| 598 | } | ||
| 599 | |||
| 600 | /* | ||
| 505 | * Splits num_nodes nodes up equally starting at node_start. The return value | 601 | * Splits num_nodes nodes up equally starting at node_start. The return value |
| 506 | * is the number of nodes split up and addr is adjusted to be at the end of the | 602 | * is the number of nodes split up and addr is adjusted to be at the end of the |
| 507 | * last node allocated. | 603 | * last node allocated. |
| @@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, | |||
| 546 | if (i == num_nodes + node_start - 1) | 642 | if (i == num_nodes + node_start - 1) |
| 547 | end = max_addr; | 643 | end = max_addr; |
| 548 | else | 644 | else |
| 549 | while (end - *addr - e820_hole_size(*addr, end) < | 645 | end = find_end_of_node(*addr, max_addr, size); |
| 550 | size) { | ||
| 551 | end += FAKE_NODE_MIN_SIZE; | ||
| 552 | if (end > max_addr) { | ||
| 553 | end = max_addr; | ||
| 554 | break; | ||
| 555 | } | ||
| 556 | } | ||
| 557 | if (setup_node_range(i, addr, end - *addr, max_addr) < 0) | 646 | if (setup_node_range(i, addr, end - *addr, max_addr) < 0) |
| 558 | break; | 647 | break; |
| 559 | } | 648 | } |
| @@ -589,6 +678,18 @@ static int __init numa_emulation(unsigned long start_pfn, | |||
| 589 | 678 | ||
| 590 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); | 679 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); |
| 591 | /* | 680 | /* |
| 681 | * If the numa=fake command-line contains a 'M' or 'G', it represents | ||
| 682 | * the fixed node size. | ||
| 683 | */ | ||
| 684 | if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { | ||
| 685 | size = memparse(cmdline, &cmdline); | ||
| 686 | num_nodes = split_nodes_size_interleave(addr, max_addr, size); | ||
| 687 | if (num_nodes < 0) | ||
| 688 | return num_nodes; | ||
| 689 | goto out; | ||
| 690 | } | ||
| 691 | |||
| 692 | /* | ||
| 592 | * If the numa=fake command-line is just a single number N, split the | 693 | * If the numa=fake command-line is just a single number N, split the |
| 593 | * system RAM into N fake nodes. | 694 | * system RAM into N fake nodes. |
| 594 | */ | 695 | */ |
