author		David Rientjes <rientjes@google.com>	2010-12-22 20:23:54 -0500
committer	H. Peter Anvin <hpa@linux.intel.com>	2010-12-23 18:27:15 -0500
commit		c1c3443c9c5e9be92641029ed229a41563e44506 (patch)
tree		44094a0e5430f162ccfe17cbd0d45ada361c2f9c /arch
parent		f51bf3073a145a5b3263fd882c52d6ec04b687da (diff)
x86, numa: Fake node-to-cpumask for NUMA emulation
It's necessary to fake the node-to-cpumask mapping so that an emulated node ID returns a cpumask that includes all cpus that have affinity to the memory it represents.

This is a little intrusive because it requires knowledge of the physical topology of the system. setup_physnodes() gives us that information, but since NUMA emulation ends up altering the physnodes array, it's necessary to reset it before cpus are brought online.

Accordingly, the physnodes array is moved out of init.data and into cpuinit.data since it will be needed on cpuup callbacks.

This works regardless of whether numa=fake is used on the command line, or the setup of the fake node succeeds or fails. The physnodes array always contains the physical topology of the machine if CONFIG_NUMA_EMU is enabled and can be used to setup the correct node-to-cpumask mappings in all cases since setup_physnodes() is called whenever the array needs to be repopulated with the correct data.

To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are rewritten for CONFIG_NUMA_EMU so that we first find the physical node to which each cpu has local affinity, then iterate through all online nodes to find the emulated nodes that have local affinity to that physical node, and then finally map the cpu to each of those emulated nodes.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
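To make the mapping concrete before the patch itself, here is a minimal, standalone C sketch (not kernel code) of the logic the message describes: a cpu is mapped to every emulated node whose memory lies inside the cpu's physical node. The node ranges, the emu_add_cpu() helper and the array sizes are hypothetical stand-ins for the kernel's physnodes[] and node_to_cpumask_map; the apicid-to-node lookup is omitted and the physical node is passed in directly.

	/* Simplified stand-in for the node-to-cpumask faking described above. */
	#include <stdio.h>

	#define MAX_NODES 8
	#define MAX_CPUS  8

	struct range { unsigned long start, end; };	/* [start, end) in bytes */

	static struct range physnodes[MAX_NODES] = {	/* two physical nodes */
		{ 0x00000000UL, 0x40000000UL },
		{ 0x40000000UL, 0x80000000UL },
	};
	static struct range emunodes[MAX_NODES] = {	/* four emulated nodes */
		{ 0x00000000UL, 0x20000000UL },
		{ 0x20000000UL, 0x40000000UL },
		{ 0x40000000UL, 0x60000000UL },
		{ 0x60000000UL, 0x80000000UL },
	};
	static int nr_emunodes = 4;
	static unsigned char node_to_cpumask[MAX_NODES][MAX_CPUS];

	/*
	 * Map @cpu, which has local affinity to physical node @physnid, onto
	 * every emulated node carved out of that physical node's memory.
	 */
	static void emu_add_cpu(int cpu, int physnid)
	{
		int nid;

		for (nid = 0; nid < nr_emunodes; nid++) {
			unsigned long addr = emunodes[nid].start;

			if (addr >= physnodes[physnid].start &&
			    addr < physnodes[physnid].end)
				node_to_cpumask[nid][cpu] = 1;
		}
	}

	int main(void)
	{
		int nid, cpu;

		emu_add_cpu(0, 0);	/* cpu 0 is local to physical node 0 */
		emu_add_cpu(1, 1);	/* cpu 1 is local to physical node 1 */

		for (nid = 0; nid < nr_emunodes; nid++) {
			printf("emulated node %d:", nid);
			for (cpu = 0; cpu < MAX_CPUS; cpu++)
				if (node_to_cpumask[nid][cpu])
					printf(" cpu%d", cpu);
			printf("\n");
		}
		return 0;
	}

With two cpus, one per physical node, the sketch reports cpu0 on emulated nodes 0-1 and cpu1 on nodes 2-3, which mirrors what the rewritten numa_add_cpu() below does with physnodes[] and node_to_cpumask_map.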
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/mm/numa_64.c	99
1 file changed, 79 insertions(+), 20 deletions(-)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index cc390f3a1bde..dd300c491f1f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -260,7 +260,7 @@ void __init numa_init_array(void)
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
 static char *cmdline __initdata;
 
 static int __init setup_physnodes(unsigned long start, unsigned long end,
@@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
 	int ret = 0;
 	int i;
 
+	memset(physnodes, 0, sizeof(physnodes));
 #ifdef CONFIG_ACPI_NUMA
 	if (acpi)
 		nr_nodes = acpi_get_nodes(physnodes);
@@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  * to max_addr. The return value is the number of nodes allocated.
  */
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
-						int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
 {
 	nodemask_t physnode_mask = NODE_MASK_NONE;
 	u64 size;
@@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 		return -1;
 	}
 
-	for (i = 0; i < nr_phys_nodes; i++)
+	for (i = 0; i < MAX_NUMNODES; i++)
 		if (physnodes[i].start != physnodes[i].end)
 			node_set(i, physnode_mask);
 
@@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn,
 {
 	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_phys_nodes;
 	int num_nodes;
 	int i;
 
-	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
 	/*
 	 * If the numa=fake command-line contains a 'M' or 'G', it represents
 	 * the fixed node size. Otherwise, if it is just a single number N,
@@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 		unsigned long n;
 
 		n = simple_strtoul(cmdline, NULL, 0);
-		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+		num_nodes = split_nodes_interleave(addr, max_addr, n);
 	}
 
 	if (num_nodes < 0)
@@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn,
 				nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
+	setup_physnodes(addr, max_addr, acpi, amd);
 	fake_physnodes(acpi, amd, num_nodes);
 	numa_init_array();
 	return 0;
@@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
 		return;
+	setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+			acpi, amd);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
@@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu)
 
 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 
+#ifndef CONFIG_NUMA_EMU
 void __cpuinit numa_add_cpu(int cpu)
 {
 	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu)
 {
 	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+	unsigned long addr;
+	u16 apicid;
+	int physnid;
+	int nid = NUMA_NO_NODE;
+
+	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+	if (apicid != BAD_APICID)
+		nid = apicid_to_node[apicid];
+	if (nid == NUMA_NO_NODE)
+		nid = early_cpu_to_node(cpu);
+	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+	/*
+	 * Use the starting address of the emulated node to find which physical
+	 * node it is allocated on.
+	 */
+	addr = node_start_pfn(nid) << PAGE_SHIFT;
+	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			break;
+
+	/*
+	 * Map the cpu to each emulated node that is allocated on the physical
+	 * node of the cpu's apic id.
+	 */
+	for_each_online_node(nid) {
+		addr = node_start_pfn(nid) << PAGE_SHIFT;
+		if (addr >= physnodes[physnid].start &&
+		    addr < physnodes[physnid].end)
+			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+	}
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	int i;
+
+	for_each_online_node(i)
+		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
 
@@ -805,22 +854,32 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
 	int node = early_cpu_to_node(cpu);
 	struct cpumask *mask;
 	char buf[64];
+	int i;
 
-	mask = node_to_cpumask_map[node];
-	if (mask == NULL) {
-		printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
-		dump_stack();
-		return;
-	}
+	for_each_online_node(i) {
+		unsigned long addr;
 
-	if (enable)
-		cpumask_set_cpu(cpu, mask);
-	else
-		cpumask_clear_cpu(cpu, mask);
+		addr = node_start_pfn(i) << PAGE_SHIFT;
+		if (addr < physnodes[node].start ||
+		    addr >= physnodes[node].end)
+			continue;
+		mask = node_to_cpumask_map[node];
+		if (mask == NULL) {
+			pr_err("node_to_cpumask_map[%i] NULL\n", i);
+			dump_stack();
+			return;
+		}
+
+		if (enable)
+			cpumask_set_cpu(cpu, mask);
+		else
+			cpumask_clear_cpu(cpu, mask);
 
-	cpulist_scnprintf(buf, sizeof(buf), mask);
-	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
-		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+		cpulist_scnprintf(buf, sizeof(buf), mask);
+		printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+			enable ? "numa_add_cpu" : "numa_remove_cpu",
+			cpu, node, buf);
+	}
 }
 
 void __cpuinit numa_add_cpu(int cpu)