aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
author: David Rientjes <rientjes@google.com> 2010-12-22 20:23:56 -0500
committer: H. Peter Anvin <hpa@linux.intel.com> 2010-12-23 18:27:16 -0500
commit: a387e95a49743cf9835c5299ca549232618d8249 (patch)
tree: 5524dd47c147943722939d756615f1a123189c7a /arch
parent: c1c3443c9c5e9be92641029ed229a41563e44506 (diff)
x86, numa: Fix cpu to node mapping for sparse node ids
NUMA boot code assumes that physical node ids start at 0, but the DIMMs that the apic id represents may not be reachable. If this is the case, node 0 is never online and cpus never end up getting appropriately assigned to a node. This causes the cpumask of all online nodes to be empty, and machines crash because kernel code assumes online nodes have valid cpus.

The fix is to appropriately map all the address ranges for physical nodes and ensure the cpu to node mapping function checks all possible nodes (up to MAX_NUMNODES) instead of simply checking nodes 0-N, where N is the number of physical nodes, for valid address ranges.

This requires no longer "compressing" the address ranges of nodes in the physical node map from 0-N, but rather leaving indices in physnodes[] to represent the actual node id of the physical node. Accordingly, the topology exported by both amd_get_nodes() and acpi_get_nodes() no longer must return the number of nodes to iterate through; all such iterations will now be to MAX_NUMNODES.

This change also passes the end address of system RAM (which may be different from normal operation if mem= is specified on the command line) to acpi_get_nodes() before the physnodes[] array is populated. ACPI parsed nodes are truncated to fit within the address range that respects the mem= boundaries, and even some physical nodes may become unreachable in such cases.

When NUMA emulation does succeed, any apicid to node mappings that exist for unreachable nodes are given default values so that proximity domains can still be assigned. This is important for node_distance() to function as desired.

Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221702090.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Diffstat (limited to 'arch')
-rw-r--r-- arch/x86/include/asm/acpi.h 3
-rw-r--r-- arch/x86/include/asm/amd_nb.h 2
-rw-r--r-- arch/x86/mm/amdtopology_64.c 9
-rw-r--r-- arch/x86/mm/numa_64.c 18
-rw-r--r-- arch/x86/mm/srat_64.c 22
5 files changed, 25 insertions, 29 deletions
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 8288daf72dc9..211ca3f7fd16 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,7 +185,8 @@ struct bootnode;
185 185
186#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
187extern int acpi_numa; 187extern int acpi_numa;
188extern int acpi_get_nodes(struct bootnode *physnodes); 188extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
189 unsigned long end);
189extern int acpi_scan_nodes(unsigned long start, unsigned long end); 190extern int acpi_scan_nodes(unsigned long start, unsigned long end);
190#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 191#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
191 192
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8f6192c1592c..980f22567631 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -14,7 +14,7 @@ extern int amd_scan_nodes(void);
14 14
15#ifdef CONFIG_NUMA_EMU 15#ifdef CONFIG_NUMA_EMU
16extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes); 16extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
17extern int amd_get_nodes(struct bootnode *nodes); 17extern void amd_get_nodes(struct bootnode *nodes);
18#endif 18#endif
19 19
20struct amd_northbridge { 20struct amd_northbridge {
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
index eb5cbb97b68d..0df2623d1039 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -187,17 +187,14 @@ static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
187 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 187 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
188}; 188};
189 189
190int __init amd_get_nodes(struct bootnode *physnodes) 190void __init amd_get_nodes(struct bootnode *physnodes)
191{ 191{
192 int i; 192 int i;
193 int ret = 0;
194 193
195 for_each_node_mask(i, nodes_parsed) { 194 for_each_node_mask(i, nodes_parsed) {
196 physnodes[ret].start = nodes[i].start; 195 physnodes[i].start = nodes[i].start;
197 physnodes[ret].end = nodes[i].end; 196 physnodes[i].end = nodes[i].end;
198 ret++;
199 } 197 }
200 return ret;
201} 198}
202 199
203static int __init find_node_by_addr(unsigned long addr) 200static int __init find_node_by_addr(unsigned long addr)
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index dd300c491f1f..3d73201ba347 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -266,25 +266,24 @@ static char *cmdline __initdata;
266static int __init setup_physnodes(unsigned long start, unsigned long end, 266static int __init setup_physnodes(unsigned long start, unsigned long end,
267 int acpi, int amd) 267 int acpi, int amd)
268{ 268{
269 int nr_nodes = 0;
270 int ret = 0; 269 int ret = 0;
271 int i; 270 int i;
272 271
273 memset(physnodes, 0, sizeof(physnodes)); 272 memset(physnodes, 0, sizeof(physnodes));
274#ifdef CONFIG_ACPI_NUMA 273#ifdef CONFIG_ACPI_NUMA
275 if (acpi) 274 if (acpi)
276 nr_nodes = acpi_get_nodes(physnodes); 275 acpi_get_nodes(physnodes, start, end);
277#endif 276#endif
278#ifdef CONFIG_AMD_NUMA 277#ifdef CONFIG_AMD_NUMA
279 if (amd) 278 if (amd)
280 nr_nodes = amd_get_nodes(physnodes); 279 amd_get_nodes(physnodes);
281#endif 280#endif
282 /* 281 /*
283 * Basic sanity checking on the physical node map: there may be errors 282 * Basic sanity checking on the physical node map: there may be errors
284 * if the SRAT or AMD code incorrectly reported the topology or the mem= 283 * if the SRAT or AMD code incorrectly reported the topology or the mem=
285 * kernel parameter is used. 284 * kernel parameter is used.
286 */ 285 */
287 for (i = 0; i < nr_nodes; i++) { 286 for (i = 0; i < MAX_NUMNODES; i++) {
288 if (physnodes[i].start == physnodes[i].end) 287 if (physnodes[i].start == physnodes[i].end)
289 continue; 288 continue;
290 if (physnodes[i].start > end) { 289 if (physnodes[i].start > end) {
@@ -299,17 +298,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
299 physnodes[i].start = start; 298 physnodes[i].start = start;
300 if (physnodes[i].end > end) 299 if (physnodes[i].end > end)
301 physnodes[i].end = end; 300 physnodes[i].end = end;
302 }
303
304 /*
305 * Remove all nodes that have no memory or were truncated because of the
306 * limited address range.
307 */
308 for (i = 0; i < nr_nodes; i++) {
309 if (physnodes[i].start == physnodes[i].end)
310 continue;
311 physnodes[ret].start = physnodes[i].start;
312 physnodes[ret].end = physnodes[i].end;
313 ret++; 301 ret++;
314 } 302 }
315 303
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c48b443706c5..a756bcf3fa48 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -340,17 +340,16 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
340void __init acpi_numa_arch_fixup(void) {} 340void __init acpi_numa_arch_fixup(void) {}
341 341
342#ifdef CONFIG_NUMA_EMU 342#ifdef CONFIG_NUMA_EMU
343int __init acpi_get_nodes(struct bootnode *physnodes) 343void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
344 unsigned long end)
344{ 345{
345 int i; 346 int i;
346 int ret = 0;
347 347
348 for_each_node_mask(i, nodes_parsed) { 348 for_each_node_mask(i, nodes_parsed) {
349 physnodes[ret].start = nodes[i].start; 349 cutoff_node(i, start, end);
350 physnodes[ret].end = nodes[i].end; 350 physnodes[i].start = nodes[i].start;
351 ret++; 351 physnodes[i].end = nodes[i].end;
352 } 352 }
353 return ret;
354} 353}
355#endif /* CONFIG_NUMA_EMU */ 354#endif /* CONFIG_NUMA_EMU */
356 355
@@ -516,6 +515,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
516 fake_apicid_to_node[j] == NUMA_NO_NODE) 515 fake_apicid_to_node[j] == NUMA_NO_NODE)
517 fake_apicid_to_node[j] = i; 516 fake_apicid_to_node[j] = i;
518 } 517 }
518
519 /*
520 * If there are apicid-to-node mappings for physical nodes that do not
521 * have a corresponding emulated node, it should default to a guaranteed
522 * value.
523 */
524 for (i = 0; i < MAX_LOCAL_APIC; i++)
525 if (apicid_to_node[i] != NUMA_NO_NODE &&
526 fake_apicid_to_node[i] == NUMA_NO_NODE)
527 fake_apicid_to_node[i] = 0;
528
519 for (i = 0; i < num_nodes; i++) 529 for (i = 0; i < num_nodes; i++)
520 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); 530 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
521 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); 531 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));