diff options
Diffstat (limited to 'arch/x86/mm/srat_64.c')
-rw-r--r-- | arch/x86/mm/srat_64.c | 367 |
1 files changed, 21 insertions, 346 deletions
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 603d285d1daa..8e9d3394f6d4 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c | |||
@@ -26,88 +26,34 @@ | |||
26 | 26 | ||
27 | int acpi_numa __initdata; | 27 | int acpi_numa __initdata; |
28 | 28 | ||
29 | static struct acpi_table_slit *acpi_slit; | ||
30 | |||
31 | static nodemask_t nodes_parsed __initdata; | ||
32 | static nodemask_t cpu_nodes_parsed __initdata; | ||
33 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
34 | static struct bootnode nodes_add[MAX_NUMNODES]; | 29 | static struct bootnode nodes_add[MAX_NUMNODES]; |
35 | 30 | ||
36 | static int num_node_memblks __initdata; | ||
37 | static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; | ||
38 | static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; | ||
39 | |||
40 | static __init int setup_node(int pxm) | 31 | static __init int setup_node(int pxm) |
41 | { | 32 | { |
42 | return acpi_map_pxm_to_node(pxm); | 33 | return acpi_map_pxm_to_node(pxm); |
43 | } | 34 | } |
44 | 35 | ||
45 | static __init int conflicting_memblks(unsigned long start, unsigned long end) | ||
46 | { | ||
47 | int i; | ||
48 | for (i = 0; i < num_node_memblks; i++) { | ||
49 | struct bootnode *nd = &node_memblk_range[i]; | ||
50 | if (nd->start == nd->end) | ||
51 | continue; | ||
52 | if (nd->end > start && nd->start < end) | ||
53 | return memblk_nodeid[i]; | ||
54 | if (nd->end == end && nd->start == start) | ||
55 | return memblk_nodeid[i]; | ||
56 | } | ||
57 | return -1; | ||
58 | } | ||
59 | |||
60 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
61 | { | ||
62 | struct bootnode *nd = &nodes[i]; | ||
63 | |||
64 | if (nd->start < start) { | ||
65 | nd->start = start; | ||
66 | if (nd->end < nd->start) | ||
67 | nd->start = nd->end; | ||
68 | } | ||
69 | if (nd->end > end) { | ||
70 | nd->end = end; | ||
71 | if (nd->start > nd->end) | ||
72 | nd->start = nd->end; | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static __init void bad_srat(void) | 36 | static __init void bad_srat(void) |
77 | { | 37 | { |
78 | int i; | ||
79 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | 38 | printk(KERN_ERR "SRAT: SRAT not used.\n"); |
80 | acpi_numa = -1; | 39 | acpi_numa = -1; |
81 | for (i = 0; i < MAX_LOCAL_APIC; i++) | 40 | memset(nodes_add, 0, sizeof(nodes_add)); |
82 | apicid_to_node[i] = NUMA_NO_NODE; | ||
83 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
84 | nodes[i].start = nodes[i].end = 0; | ||
85 | nodes_add[i].start = nodes_add[i].end = 0; | ||
86 | } | ||
87 | remove_all_active_ranges(); | ||
88 | } | 41 | } |
89 | 42 | ||
90 | static __init inline int srat_disabled(void) | 43 | static __init inline int srat_disabled(void) |
91 | { | 44 | { |
92 | return numa_off || acpi_numa < 0; | 45 | return acpi_numa < 0; |
93 | } | 46 | } |
94 | 47 | ||
95 | /* Callback for SLIT parsing */ | 48 | /* Callback for SLIT parsing */ |
96 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | 49 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) |
97 | { | 50 | { |
98 | unsigned length; | 51 | int i, j; |
99 | unsigned long phys; | ||
100 | |||
101 | length = slit->header.length; | ||
102 | phys = memblock_find_in_range(0, max_pfn_mapped<<PAGE_SHIFT, length, | ||
103 | PAGE_SIZE); | ||
104 | |||
105 | if (phys == MEMBLOCK_ERROR) | ||
106 | panic(" Can not save slit!\n"); | ||
107 | 52 | ||
108 | acpi_slit = __va(phys); | 53 | for (i = 0; i < slit->locality_count; i++) |
109 | memcpy(acpi_slit, slit, length); | 54 | for (j = 0; j < slit->locality_count; j++) |
110 | memblock_x86_reserve_range(phys, phys + length, "ACPI SLIT"); | 55 | numa_set_distance(pxm_to_node(i), pxm_to_node(j), |
56 | slit->entry[slit->locality_count * i + j]); | ||
111 | } | 57 | } |
112 | 58 | ||
113 | /* Callback for Proximity Domain -> x2APIC mapping */ | 59 | /* Callback for Proximity Domain -> x2APIC mapping */ |
@@ -138,8 +84,8 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | |||
138 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | 84 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); |
139 | return; | 85 | return; |
140 | } | 86 | } |
141 | apicid_to_node[apic_id] = node; | 87 | set_apicid_to_node(apic_id, node); |
142 | node_set(node, cpu_nodes_parsed); | 88 | node_set(node, numa_nodes_parsed); |
143 | acpi_numa = 1; | 89 | acpi_numa = 1; |
144 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", | 90 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", |
145 | pxm, apic_id, node); | 91 | pxm, apic_id, node); |
@@ -178,8 +124,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | |||
178 | return; | 124 | return; |
179 | } | 125 | } |
180 | 126 | ||
181 | apicid_to_node[apic_id] = node; | 127 | set_apicid_to_node(apic_id, node); |
182 | node_set(node, cpu_nodes_parsed); | 128 | node_set(node, numa_nodes_parsed); |
183 | acpi_numa = 1; | 129 | acpi_numa = 1; |
184 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", | 130 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", |
185 | pxm, apic_id, node); | 131 | pxm, apic_id, node); |
@@ -241,7 +187,7 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
241 | } | 187 | } |
242 | 188 | ||
243 | if (changed) { | 189 | if (changed) { |
244 | node_set(node, cpu_nodes_parsed); | 190 | node_set(node, numa_nodes_parsed); |
245 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", | 191 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", |
246 | nd->start, nd->end); | 192 | nd->start, nd->end); |
247 | } | 193 | } |
@@ -251,10 +197,8 @@ update_nodes_add(int node, unsigned long start, unsigned long end) | |||
251 | void __init | 197 | void __init |
252 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | 198 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) |
253 | { | 199 | { |
254 | struct bootnode *nd, oldnode; | ||
255 | unsigned long start, end; | 200 | unsigned long start, end; |
256 | int node, pxm; | 201 | int node, pxm; |
257 | int i; | ||
258 | 202 | ||
259 | if (srat_disabled()) | 203 | if (srat_disabled()) |
260 | return; | 204 | return; |
@@ -276,300 +220,31 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | |||
276 | bad_srat(); | 220 | bad_srat(); |
277 | return; | 221 | return; |
278 | } | 222 | } |
279 | i = conflicting_memblks(start, end); | 223 | |
280 | if (i == node) { | 224 | if (numa_add_memblk(node, start, end) < 0) { |
281 | printk(KERN_WARNING | ||
282 | "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", | ||
283 | pxm, start, end, nodes[i].start, nodes[i].end); | ||
284 | } else if (i >= 0) { | ||
285 | printk(KERN_ERR | ||
286 | "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", | ||
287 | pxm, start, end, node_to_pxm(i), | ||
288 | nodes[i].start, nodes[i].end); | ||
289 | bad_srat(); | 225 | bad_srat(); |
290 | return; | 226 | return; |
291 | } | 227 | } |
292 | nd = &nodes[node]; | ||
293 | oldnode = *nd; | ||
294 | if (!node_test_and_set(node, nodes_parsed)) { | ||
295 | nd->start = start; | ||
296 | nd->end = end; | ||
297 | } else { | ||
298 | if (start < nd->start) | ||
299 | nd->start = start; | ||
300 | if (nd->end < end) | ||
301 | nd->end = end; | ||
302 | } | ||
303 | 228 | ||
304 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, | 229 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, |
305 | start, end); | 230 | start, end); |
306 | 231 | ||
307 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { | 232 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) |
308 | update_nodes_add(node, start, end); | 233 | update_nodes_add(node, start, end); |
309 | /* restore nodes[node] */ | ||
310 | *nd = oldnode; | ||
311 | if ((nd->start | nd->end) == 0) | ||
312 | node_clear(node, nodes_parsed); | ||
313 | } | ||
314 | |||
315 | node_memblk_range[num_node_memblks].start = start; | ||
316 | node_memblk_range[num_node_memblks].end = end; | ||
317 | memblk_nodeid[num_node_memblks] = node; | ||
318 | num_node_memblks++; | ||
319 | } | ||
320 | |||
321 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
322 | Make sure the PXMs cover all memory. */ | ||
323 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
324 | { | ||
325 | int i; | ||
326 | unsigned long pxmram, e820ram; | ||
327 | |||
328 | pxmram = 0; | ||
329 | for_each_node_mask(i, nodes_parsed) { | ||
330 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
331 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
332 | pxmram += e - s; | ||
333 | pxmram -= __absent_pages_in_range(i, s, e); | ||
334 | if ((long)pxmram < 0) | ||
335 | pxmram = 0; | ||
336 | } | ||
337 | |||
338 | e820ram = max_pfn - (memblock_x86_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); | ||
339 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
340 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { | ||
341 | printk(KERN_ERR | ||
342 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
343 | (pxmram << PAGE_SHIFT) >> 20, | ||
344 | (e820ram << PAGE_SHIFT) >> 20); | ||
345 | return 0; | ||
346 | } | ||
347 | return 1; | ||
348 | } | 234 | } |
349 | 235 | ||
350 | void __init acpi_numa_arch_fixup(void) {} | 236 | void __init acpi_numa_arch_fixup(void) {} |
351 | 237 | ||
352 | #ifdef CONFIG_NUMA_EMU | 238 | int __init x86_acpi_numa_init(void) |
353 | void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start, | ||
354 | unsigned long end) | ||
355 | { | ||
356 | int i; | ||
357 | |||
358 | for_each_node_mask(i, nodes_parsed) { | ||
359 | cutoff_node(i, start, end); | ||
360 | physnodes[i].start = nodes[i].start; | ||
361 | physnodes[i].end = nodes[i].end; | ||
362 | } | ||
363 | } | ||
364 | #endif /* CONFIG_NUMA_EMU */ | ||
365 | |||
366 | /* Use the information discovered above to actually set up the nodes. */ | ||
367 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
368 | { | 239 | { |
369 | int i; | 240 | int ret; |
370 | |||
371 | if (acpi_numa <= 0) | ||
372 | return -1; | ||
373 | |||
374 | /* First clean up the node list */ | ||
375 | for (i = 0; i < MAX_NUMNODES; i++) | ||
376 | cutoff_node(i, start, end); | ||
377 | |||
378 | /* | ||
379 | * Join together blocks on the same node, holes between | ||
380 | * which don't overlap with memory on other nodes. | ||
381 | */ | ||
382 | for (i = 0; i < num_node_memblks; ++i) { | ||
383 | int j, k; | ||
384 | |||
385 | for (j = i + 1; j < num_node_memblks; ++j) { | ||
386 | unsigned long start, end; | ||
387 | |||
388 | if (memblk_nodeid[i] != memblk_nodeid[j]) | ||
389 | continue; | ||
390 | start = min(node_memblk_range[i].end, | ||
391 | node_memblk_range[j].end); | ||
392 | end = max(node_memblk_range[i].start, | ||
393 | node_memblk_range[j].start); | ||
394 | for (k = 0; k < num_node_memblks; ++k) { | ||
395 | if (memblk_nodeid[i] == memblk_nodeid[k]) | ||
396 | continue; | ||
397 | if (start < node_memblk_range[k].end && | ||
398 | end > node_memblk_range[k].start) | ||
399 | break; | ||
400 | } | ||
401 | if (k < num_node_memblks) | ||
402 | continue; | ||
403 | start = min(node_memblk_range[i].start, | ||
404 | node_memblk_range[j].start); | ||
405 | end = max(node_memblk_range[i].end, | ||
406 | node_memblk_range[j].end); | ||
407 | printk(KERN_INFO "SRAT: Node %d " | ||
408 | "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", | ||
409 | memblk_nodeid[i], | ||
410 | node_memblk_range[i].start, | ||
411 | node_memblk_range[i].end, | ||
412 | node_memblk_range[j].start, | ||
413 | node_memblk_range[j].end, | ||
414 | start, end); | ||
415 | node_memblk_range[i].start = start; | ||
416 | node_memblk_range[i].end = end; | ||
417 | k = --num_node_memblks - j; | ||
418 | memmove(memblk_nodeid + j, memblk_nodeid + j+1, | ||
419 | k * sizeof(*memblk_nodeid)); | ||
420 | memmove(node_memblk_range + j, node_memblk_range + j+1, | ||
421 | k * sizeof(*node_memblk_range)); | ||
422 | --j; | ||
423 | } | ||
424 | } | ||
425 | |||
426 | memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, | ||
427 | memblk_nodeid); | ||
428 | if (memnode_shift < 0) { | ||
429 | printk(KERN_ERR | ||
430 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
431 | bad_srat(); | ||
432 | return -1; | ||
433 | } | ||
434 | |||
435 | for (i = 0; i < num_node_memblks; i++) | ||
436 | memblock_x86_register_active_regions(memblk_nodeid[i], | ||
437 | node_memblk_range[i].start >> PAGE_SHIFT, | ||
438 | node_memblk_range[i].end >> PAGE_SHIFT); | ||
439 | |||
440 | /* for out of order entries in SRAT */ | ||
441 | sort_node_map(); | ||
442 | if (!nodes_cover_memory(nodes)) { | ||
443 | bad_srat(); | ||
444 | return -1; | ||
445 | } | ||
446 | 241 | ||
447 | /* Account for nodes with cpus and no memory */ | 242 | ret = acpi_numa_init(); |
448 | nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); | 243 | if (ret < 0) |
449 | 244 | return ret; | |
450 | /* Finally register nodes */ | 245 | return srat_disabled() ? -EINVAL : 0; |
451 | for_each_node_mask(i, node_possible_map) | ||
452 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
453 | /* Try again in case setup_node_bootmem missed one due | ||
454 | to missing bootmem */ | ||
455 | for_each_node_mask(i, node_possible_map) | ||
456 | if (!node_online(i)) | ||
457 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
458 | |||
459 | for (i = 0; i < nr_cpu_ids; i++) { | ||
460 | int node = early_cpu_to_node(i); | ||
461 | |||
462 | if (node == NUMA_NO_NODE) | ||
463 | continue; | ||
464 | if (!node_online(node)) | ||
465 | numa_clear_node(i); | ||
466 | } | ||
467 | numa_init_array(); | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | #ifdef CONFIG_NUMA_EMU | ||
472 | static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { | ||
473 | [0 ... MAX_NUMNODES-1] = PXM_INVAL | ||
474 | }; | ||
475 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
476 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
477 | }; | ||
478 | static int __init find_node_by_addr(unsigned long addr) | ||
479 | { | ||
480 | int ret = NUMA_NO_NODE; | ||
481 | int i; | ||
482 | |||
483 | for_each_node_mask(i, nodes_parsed) { | ||
484 | /* | ||
485 | * Find the real node that this emulated node appears on. For | ||
486 | * the sake of simplicity, we only use a real node's starting | ||
487 | * address to determine which emulated node it appears on. | ||
488 | */ | ||
489 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
490 | ret = i; | ||
491 | break; | ||
492 | } | ||
493 | } | ||
494 | return ret; | ||
495 | } | 246 | } |
496 | 247 | ||
497 | /* | ||
498 | * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID | ||
499 | * mappings that respect the real ACPI topology but reflect our emulated | ||
500 | * environment. For each emulated node, we find which real node it appears on | ||
501 | * and create PXM to NID mappings for those fake nodes which mirror that | ||
502 | * locality. SLIT will now represent the correct distances between emulated | ||
503 | * nodes as a result of the real topology. | ||
504 | */ | ||
505 | void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | ||
506 | { | ||
507 | int i, j; | ||
508 | |||
509 | for (i = 0; i < num_nodes; i++) { | ||
510 | int nid, pxm; | ||
511 | |||
512 | nid = find_node_by_addr(fake_nodes[i].start); | ||
513 | if (nid == NUMA_NO_NODE) | ||
514 | continue; | ||
515 | pxm = node_to_pxm(nid); | ||
516 | if (pxm == PXM_INVAL) | ||
517 | continue; | ||
518 | fake_node_to_pxm_map[i] = pxm; | ||
519 | /* | ||
520 | * For each apicid_to_node mapping that exists for this real | ||
521 | * node, it must now point to the fake node ID. | ||
522 | */ | ||
523 | for (j = 0; j < MAX_LOCAL_APIC; j++) | ||
524 | if (apicid_to_node[j] == nid && | ||
525 | fake_apicid_to_node[j] == NUMA_NO_NODE) | ||
526 | fake_apicid_to_node[j] = i; | ||
527 | } | ||
528 | |||
529 | /* | ||
530 | * If there are apicid-to-node mappings for physical nodes that do not | ||
531 | * have a corresponding emulated node, it should default to a guaranteed | ||
532 | * value. | ||
533 | */ | ||
534 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
535 | if (apicid_to_node[i] != NUMA_NO_NODE && | ||
536 | fake_apicid_to_node[i] == NUMA_NO_NODE) | ||
537 | fake_apicid_to_node[i] = 0; | ||
538 | |||
539 | for (i = 0; i < num_nodes; i++) | ||
540 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | ||
541 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
542 | |||
543 | nodes_clear(nodes_parsed); | ||
544 | for (i = 0; i < num_nodes; i++) | ||
545 | if (fake_nodes[i].start != fake_nodes[i].end) | ||
546 | node_set(i, nodes_parsed); | ||
547 | } | ||
548 | |||
549 | static int null_slit_node_compare(int a, int b) | ||
550 | { | ||
551 | return node_to_pxm(a) == node_to_pxm(b); | ||
552 | } | ||
553 | #else | ||
554 | static int null_slit_node_compare(int a, int b) | ||
555 | { | ||
556 | return a == b; | ||
557 | } | ||
558 | #endif /* CONFIG_NUMA_EMU */ | ||
559 | |||
560 | int __node_distance(int a, int b) | ||
561 | { | ||
562 | int index; | ||
563 | |||
564 | if (!acpi_slit) | ||
565 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
566 | REMOTE_DISTANCE; | ||
567 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
568 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
569 | } | ||
570 | |||
571 | EXPORT_SYMBOL(__node_distance); | ||
572 | |||
573 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) | 248 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) |
574 | int memory_add_physaddr_to_nid(u64 start) | 249 | int memory_add_physaddr_to_nid(u64 start) |
575 | { | 250 | { |