author     Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>   2013-12-30 06:35:34 -0500
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>     2014-01-14 21:58:37 -0500
commit     d4edc5b6c480a0917e61d93d55531d7efa6230be (patch)
tree       449a4b3402bd7aad83b1c45501a64d0c829d4917
parent     0c4b9e27b09eeb4da84451c038a587b92ce93ff5 (diff)
powerpc: Fix the setup of CPU-to-Node mappings during CPU online
On POWER platforms, the hypervisor can notify the guest kernel about dynamic changes in the cpu-numa associativity (VPHN topology update). Hence the cpu-to-node mappings that we got from the firmware during boot may no longer be valid after such updates. This is handled using the arch_update_cpu_topology() hook in the scheduler, and the sched-domains are rebuilt according to the new mappings.

But unfortunately, at the moment, CPU hotplug ignores these updated mappings and instead queries the firmware for the cpu-to-numa relationships and uses them during CPU online. So the kernel can end up assigning wrong NUMA nodes to CPUs during subsequent CPU hotplug online operations (after booting).

Further, a particularly problematic scenario can result from this bug. On POWER platforms, the SMT mode can be switched between 1, 2, 4 (and even 8) threads per core. The switch to Single-Threaded (ST) mode is performed by offlining all except the first CPU thread in each core. Switching back to SMT mode involves onlining those other threads back, in each core.

Now consider this scenario:

1. During boot, the kernel gets the cpu-to-node mappings from the firmware and assigns the CPUs to NUMA nodes appropriately, during CPU online.

2. Later on, the hypervisor updates the cpu-to-node mappings dynamically and communicates this update to the kernel. The kernel in turn updates its cpu-to-node associations and rebuilds its sched domains. Everything is fine so far.

3. Now, the user switches the machine from SMT to ST mode (say, by running ppc64_cpu --smt=1). This involves offlining all except 1 thread in each core.

4. The user then tries to switch back from ST to SMT mode (say, by running ppc64_cpu --smt=4), and this involves onlining those threads back. Since CPU hotplug ignores the new mappings, it queries the firmware and tries to associate the newly onlined sibling threads with the old NUMA nodes. This results in sibling threads within the same core getting associated with different NUMA nodes, which is incorrect.

The scheduler's build-sched-domains code gets thoroughly confused by this, enters an infinite loop, and causes soft-lockups, as explained in detail in commit 3be7db6ab (powerpc: VPHN topology change updates all siblings).

So to fix this, use the numa_cpu_lookup_table to remember the updated cpu-to-node mappings, and use them during CPU hotplug online operations. Further, we also need to ensure that all threads in a core are assigned to a common NUMA node, irrespective of whether all those threads were online during the topology update. To achieve this, we take care not to use cpu_sibling_mask(), since it is not hotplug-invariant. Instead, we use cpu_first_thread_sibling() and set up the mappings manually using the 'threads_per_core' value for that particular platform. This helps us ensure that we don't hit this bug with any combination of CPU hotplug and SMT mode switching.

Cc: stable@vger.kernel.org
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
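The reason cpu_first_thread_sibling() is safe here while cpu_sibling_mask() is not: the former is pure arithmetic on the CPU id, so it identifies the core boundary even when the sibling threads are currently offline. A minimal userspace sketch of that arithmetic, assuming the standard powerpc definition (cpu & ~(threads_per_core - 1), with threads_per_core a power of two) and an illustrative SMT4 configuration:

/* Userspace model (illustrative only) of powerpc's hotplug-invariant
 * sibling arithmetic; assumes threads_per_core is a power of two. */
#include <stdio.h>

static int threads_per_core = 4;        /* assume SMT4 for this example */

/* Mirrors cpu_first_thread_sibling(): pure arithmetic on the CPU id,
 * so it needs no knowledge of which sibling threads are online. */
static int cpu_first_thread_sibling(int cpu)
{
        return cpu & ~(threads_per_core - 1);
}

int main(void)
{
        int cpu = 6;    /* thread 2 of core 1; its siblings may be offline */
        int base = cpu_first_thread_sibling(cpu);

        for (int j = 0; j < threads_per_core; j++)
                printf("cpu %d belongs to the core starting at cpu %d\n",
                       base + j, base);
        return 0;
}

Because the result depends only on the CPU number, the whole core can be remapped to a node in one sweep, even in ST mode when three of the four threads are offline.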
-rw-r--r--  arch/powerpc/include/asm/topology.h | 10
-rw-r--r--  arch/powerpc/mm/numa.c              | 70
2 files changed, 76 insertions(+), 4 deletions(-)
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 89e3ef2496ac..d0b5fca6b077 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -22,7 +22,15 @@ struct device_node;
 
 static inline int cpu_to_node(int cpu)
 {
-        return numa_cpu_lookup_table[cpu];
+        int nid;
+
+        nid = numa_cpu_lookup_table[cpu];
+
+        /*
+         * During early boot, the numa-cpu lookup table might not have been
+         * setup for all CPUs yet. In such cases, default to node 0.
+         */
+        return (nid < 0) ? 0 : nid;
 }
 
 #define parent_node(node)        (node)
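The fallback above can be exercised with a small userspace model; numa_cpu_lookup_table here is a hypothetical stand-in for the kernel's array, filled with the -1 sentinel via a GCC range initializer:

/* Userspace sketch of the early-boot fallback in cpu_to_node():
 * entries still at -1 (not yet set up) default to node 0. */
#include <stdio.h>

#define NR_CPUS 8

static int numa_cpu_lookup_table[NR_CPUS] = {
        [0 ... NR_CPUS - 1] = -1        /* GCC range initializer */
};

static int cpu_to_node(int cpu)
{
        int nid = numa_cpu_lookup_table[cpu];

        /* Not mapped yet (early boot): default to node 0. */
        return (nid < 0) ? 0 : nid;
}

int main(void)
{
        numa_cpu_lookup_table[2] = 1;   /* pretend CPU 2 was mapped to node 1 */

        printf("cpu 0 -> node %d (fallback)\n", cpu_to_node(0));
        printf("cpu 2 -> node %d (mapped)\n", cpu_to_node(2));
        return 0;
}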
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 078d3e00a616..6847d509162f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -31,6 +31,8 @@
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
 #include <asm/firmware.h>
 #include <asm/paca.h>
 #include <asm/hvcall.h>
@@ -152,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,
         }
 }
 
-static void map_cpu_to_node(int cpu, int node)
+static void reset_numa_cpu_lookup_table(void)
+{
+        unsigned int cpu;
+
+        for_each_possible_cpu(cpu)
+                numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
         numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+        update_numa_cpu_lookup_table(cpu, node);
 
         dbg("adding cpu %d to node %d\n", cpu, node);
 
@@ -522,11 +537,24 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-        int nid = 0;
-        struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
+        int nid;
+        struct device_node *cpu;
+
+        /*
+         * If a valid cpu-to-node mapping is already available, use it
+         * directly instead of querying the firmware, since it represents
+         * the most recent mapping notified to us by the platform (eg: VPHN).
+         */
+        if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+                map_cpu_to_node(lcpu, nid);
+                return nid;
+        }
+
+        cpu = of_get_cpu_node(lcpu, NULL);
 
         if (!cpu) {
                 WARN_ON(1);
+                nid = 0;
                 goto out;
         }
 
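The lookup-first flow added here can be modelled in userspace as follows; firmware_query_nid() is a hypothetical stub standing in for the device-tree path (of_get_cpu_node() plus the nid resolution), not a real kernel API:

/* Userspace model of the lookup-first logic in numa_setup_cpu():
 * a valid cached mapping wins over the firmware's boot-time answer. */
#include <stdio.h>

#define NR_CPUS 8

static int numa_cpu_lookup_table[NR_CPUS] = { [0 ... NR_CPUS - 1] = -1 };

static int firmware_query_nid(int cpu)
{
        (void)cpu;
        return 0;       /* pretend firmware still reports the boot-time node */
}

static int numa_setup_cpu(int lcpu)
{
        int nid = numa_cpu_lookup_table[lcpu];

        /* A valid cached mapping (e.g. written by a VPHN topology update)
         * takes precedence over the stale firmware answer. */
        if (nid >= 0)
                return nid;

        nid = firmware_query_nid(lcpu);
        numa_cpu_lookup_table[lcpu] = nid;
        return nid;
}

int main(void)
{
        numa_cpu_lookup_table[5] = 1;   /* simulate a VPHN update for CPU 5 */

        printf("cpu 4 online -> node %d (firmware)\n", numa_setup_cpu(4));
        printf("cpu 5 online -> node %d (cached)\n", numa_setup_cpu(5));
        return 0;
}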
@@ -1067,6 +1095,7 @@ void __init do_init_bootmem(void)
          */
         setup_node_to_cpumask_map();
 
+        reset_numa_cpu_lookup_table();
         register_cpu_notifier(&ppc64_numa_nb);
         cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
                           (void *)(unsigned long)boot_cpuid);
@@ -1445,6 +1474,33 @@ static int update_cpu_topology(void *data)
         return 0;
 }
 
+static int update_lookup_table(void *data)
+{
+        struct topology_update_data *update;
+
+        if (!data)
+                return -EINVAL;
+
+        /*
+         * Upon topology update, the numa-cpu lookup table needs to be updated
+         * for all threads in the core, including offline CPUs, to ensure that
+         * future hotplug operations respect the cpu-to-node associativity
+         * properly.
+         */
+        for (update = data; update; update = update->next) {
+                int nid, base, j;
+
+                nid = update->new_nid;
+                base = cpu_first_thread_sibling(update->cpu);
+
+                for (j = 0; j < threads_per_core; j++) {
+                        update_numa_cpu_lookup_table(base + j, nid);
+                }
+        }
+
+        return 0;
+}
+
 /*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
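The per-core sweep above can be illustrated with the same kind of userspace model; struct topology_update_data here is a cut-down stand-in for the kernel's, keeping only the fields the loop touches:

/* Userspace sketch of the per-core sweep in update_lookup_table():
 * one updated CPU drags all of its siblings, online or offline,
 * onto the same node. */
#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 8

static int threads_per_core = 4;        /* assume SMT4 */
static int numa_cpu_lookup_table[NR_CPUS];

struct topology_update_data {           /* simplified stand-in */
        struct topology_update_data *next;
        int cpu;
        int new_nid;
};

static int cpu_first_thread_sibling(int cpu)
{
        return cpu & ~(threads_per_core - 1);
}

static void update_lookup_table(struct topology_update_data *update)
{
        for (; update; update = update->next) {
                int base = cpu_first_thread_sibling(update->cpu);

                /* Cover every thread in the core, not just online ones. */
                for (int j = 0; j < threads_per_core; j++)
                        numa_cpu_lookup_table[base + j] = update->new_nid;
        }
}

int main(void)
{
        /* Say VPHN moved CPU 5 (a thread of core 1) to node 1. */
        struct topology_update_data ud = { NULL, 5, 1 };

        update_lookup_table(&ud);
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d -> node %d\n", cpu, numa_cpu_lookup_table[cpu]);
        return 0;
}

This is what guarantees that a later online of an offline sibling (e.g. via ppc64_cpu --smt=4) finds the updated node in the table rather than re-asking the firmware.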
@@ -1513,6 +1569,14 @@ int arch_update_cpu_topology(void)
 
         stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
+        /*
+         * Update the numa-cpu lookup table with the new mappings, even for
+         * offline CPUs. It is best to perform this update from the stop-
+         * machine context.
+         */
+        stop_machine(update_lookup_table, &updates[0],
+                                        cpumask_of(raw_smp_processor_id()));
+
         for (ud = &updates[0]; ud; ud = ud->next) {
                 unregister_cpu_under_node(ud->cpu, ud->old_nid);
                 register_cpu_under_node(ud->cpu, ud->new_nid);