summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSrikar Dronamraju <srikar@linux.vnet.ibm.com>2018-08-17 10:54:39 -0400
committerMichael Ellerman <mpe@ellerman.id.au>2018-08-21 02:01:59 -0400
commit2ea62630681027c455117aa471ea3ab8bb099ead (patch)
tree17f63129cfaf10b46ce2e498ff22be103adadda5
parentd6ee76d3d37d156c479348821574b6f99d6472a1 (diff)
powerpc/topology: Get topology for shared processors at boot
On a shared LPAR, Phyp will not update the CPU associativity at boot time. Only just after boot does the system recognize itself as a shared LPAR and trigger a request for the correct CPU associativity. But by then the scheduler would have already created/destroyed its sched domains. This causes: - Broken load balance across nodes, causing islands of cores. - Performance degradation, especially if the system is lightly loaded. - dmesg to wrongly report all CPUs to be in Node 0. - Messages in dmesg saying borken topology. - With commit 051f3ca02e46 ("sched/topology: Introduce NUMA identity node sched domain"), can cause rcu stalls at boot up. The sched_domains_numa_masks table, which is used to generate cpumasks, is only created at boot time just before creating sched domains and is never updated. Hence, it's better to get the topology correct before the sched domains are created. For example, on a 64 core Power 8 shared LPAR, dmesg reports Brought up 512 CPUs Node 0 CPUs: 0-511 Node 1 CPUs: Node 2 CPUs: Node 3 CPUs: Node 4 CPUs: Node 5 CPUs: Node 6 CPUs: Node 7 CPUs: Node 8 CPUs: Node 9 CPUs: Node 10 CPUs: Node 11 CPUs: ... 
BUG: arch topology borken the DIE domain not a subset of the NUMA domain BUG: arch topology borken the DIE domain not a subset of the NUMA domain numactl/lscpu output will still be correct with cores spreading across all nodes: Socket(s): 64 NUMA node(s): 12 Model: 2.0 (pvr 004d 0200) Model name: POWER8 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 64K L1i cache: 32K NUMA node0 CPU(s): 0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471 NUMA node1 CPU(s): 8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479 NUMA node2 CPU(s): 16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487 NUMA node3 CPU(s): 24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495 NUMA node4 CPU(s): 208-215,304-311,400-407,496-503 NUMA node5 CPU(s): 168-175,264-271,360-367,456-463 NUMA node6 CPU(s): 128-135,224-231,320-327,416-423 NUMA node7 CPU(s): 136-143,232-239,328-335,424-431 NUMA node8 CPU(s): 216-223,312-319,408-415,504-511 NUMA node9 CPU(s): 144-151,240-247,336-343,432-439 NUMA node10 CPU(s): 152-159,248-255,344-351,440-447 NUMA node11 CPU(s): 160-167,256-263,352-359,448-455 Currently on this LPAR, the scheduler detects 2 levels of NUMA and creates NUMA sched domains for all CPUs, but it finds a single DIE domain consisting of all CPUs. Hence it deletes all NUMA sched domains. To address this, detect the shared processor and update the topology soon after the CPUs are set up, so that the correct topology is in place just before the scheduler creates the sched domains. 
With the fix, dmesg reports: numa: Node 0 CPUs: 0-7 32-39 64-71 96-103 176-183 272-279 368-375 464-471 numa: Node 1 CPUs: 8-15 40-47 72-79 104-111 184-191 280-287 376-383 472-479 numa: Node 2 CPUs: 16-23 48-55 80-87 112-119 192-199 288-295 384-391 480-487 numa: Node 3 CPUs: 24-31 56-63 88-95 120-127 200-207 296-303 392-399 488-495 numa: Node 4 CPUs: 208-215 304-311 400-407 496-503 numa: Node 5 CPUs: 168-175 264-271 360-367 456-463 numa: Node 6 CPUs: 128-135 224-231 320-327 416-423 numa: Node 7 CPUs: 136-143 232-239 328-335 424-431 numa: Node 8 CPUs: 216-223 312-319 408-415 504-511 numa: Node 9 CPUs: 144-151 240-247 336-343 432-439 numa: Node 10 CPUs: 152-159 248-255 344-351 440-447 numa: Node 11 CPUs: 160-167 256-263 352-359 448-455 and lscpu also reports: Socket(s): 64 NUMA node(s): 12 Model: 2.0 (pvr 004d 0200) Model name: POWER8 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 64K L1i cache: 32K NUMA node0 CPU(s): 0-7,32-39,64-71,96-103,176-183,272-279,368-375,464-471 NUMA node1 CPU(s): 8-15,40-47,72-79,104-111,184-191,280-287,376-383,472-479 NUMA node2 CPU(s): 16-23,48-55,80-87,112-119,192-199,288-295,384-391,480-487 NUMA node3 CPU(s): 24-31,56-63,88-95,120-127,200-207,296-303,392-399,488-495 NUMA node4 CPU(s): 208-215,304-311,400-407,496-503 NUMA node5 CPU(s): 168-175,264-271,360-367,456-463 NUMA node6 CPU(s): 128-135,224-231,320-327,416-423 NUMA node7 CPU(s): 136-143,232-239,328-335,424-431 NUMA node8 CPU(s): 216-223,312-319,408-415,504-511 NUMA node9 CPU(s): 144-151,240-247,336-343,432-439 NUMA node10 CPU(s): 152-159,248-255,344-351,440-447 NUMA node11 CPU(s): 160-167,256-263,352-359,448-455 Reported-by: Manjunatha H R <manjuhr1@in.ibm.com> Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> [mpe: Trim / format change log] Tested-by: Michael Ellerman <mpe@ellerman.id.au> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
-rw-r--r--arch/powerpc/include/asm/topology.h5
-rw-r--r--arch/powerpc/kernel/smp.c5
-rw-r--r--arch/powerpc/mm/numa.c20
3 files changed, 20 insertions, 10 deletions
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 16b077801a5f..a4a718dbfec6 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -92,6 +92,7 @@ extern int stop_topology_update(void);
92extern int prrn_is_enabled(void); 92extern int prrn_is_enabled(void);
93extern int find_and_online_cpu_nid(int cpu); 93extern int find_and_online_cpu_nid(int cpu);
94extern int timed_topology_update(int nsecs); 94extern int timed_topology_update(int nsecs);
95extern void __init shared_proc_topology_init(void);
95#else 96#else
96static inline int start_topology_update(void) 97static inline int start_topology_update(void)
97{ 98{
@@ -113,6 +114,10 @@ static inline int timed_topology_update(int nsecs)
113{ 114{
114 return 0; 115 return 0;
115} 116}
117
118#ifdef CONFIG_SMP
119static inline void shared_proc_topology_init(void) {}
120#endif
116#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ 121#endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
117 122
118#include <asm-generic/topology.h> 123#include <asm-generic/topology.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index b19d832ef386..61c1fadbc644 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1160,6 +1160,11 @@ void __init smp_cpus_done(unsigned int max_cpus)
1160 if (smp_ops && smp_ops->bringup_done) 1160 if (smp_ops && smp_ops->bringup_done)
1161 smp_ops->bringup_done(); 1161 smp_ops->bringup_done();
1162 1162
1163 /*
1164 * On a shared LPAR, associativity needs to be requested.
1165 * Hence, get numa topology before dumping cpu topology
1166 */
1167 shared_proc_topology_init();
1163 dump_numa_cpu_topology(); 1168 dump_numa_cpu_topology();
1164 1169
1165 /* 1170 /*
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 0c7e05d89244..35ac5422903a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1078,7 +1078,6 @@ static int prrn_enabled;
1078static void reset_topology_timer(void); 1078static void reset_topology_timer(void);
1079static int topology_timer_secs = 1; 1079static int topology_timer_secs = 1;
1080static int topology_inited; 1080static int topology_inited;
1081static int topology_update_needed;
1082 1081
1083/* 1082/*
1084 * Change polling interval for associativity changes. 1083 * Change polling interval for associativity changes.
@@ -1306,11 +1305,8 @@ int numa_update_cpu_topology(bool cpus_locked)
1306 struct device *dev; 1305 struct device *dev;
1307 int weight, new_nid, i = 0; 1306 int weight, new_nid, i = 0;
1308 1307
1309 if (!prrn_enabled && !vphn_enabled) { 1308 if (!prrn_enabled && !vphn_enabled && topology_inited)
1310 if (!topology_inited)
1311 topology_update_needed = 1;
1312 return 0; 1309 return 0;
1313 }
1314 1310
1315 weight = cpumask_weight(&cpu_associativity_changes_mask); 1311 weight = cpumask_weight(&cpu_associativity_changes_mask);
1316 if (!weight) 1312 if (!weight)
@@ -1423,7 +1419,6 @@ int numa_update_cpu_topology(bool cpus_locked)
1423 1419
1424out: 1420out:
1425 kfree(updates); 1421 kfree(updates);
1426 topology_update_needed = 0;
1427 return changed; 1422 return changed;
1428} 1423}
1429 1424
@@ -1551,6 +1546,15 @@ int prrn_is_enabled(void)
1551 return prrn_enabled; 1546 return prrn_enabled;
1552} 1547}
1553 1548
1549void __init shared_proc_topology_init(void)
1550{
1551 if (lppaca_shared_proc(get_lppaca())) {
1552 bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
1553 nr_cpumask_bits);
1554 numa_update_cpu_topology(false);
1555 }
1556}
1557
1554static int topology_read(struct seq_file *file, void *v) 1558static int topology_read(struct seq_file *file, void *v)
1555{ 1559{
1556 if (vphn_enabled || prrn_enabled) 1560 if (vphn_enabled || prrn_enabled)
@@ -1608,10 +1612,6 @@ static int topology_update_init(void)
1608 return -ENOMEM; 1612 return -ENOMEM;
1609 1613
1610 topology_inited = 1; 1614 topology_inited = 1;
1611 if (topology_update_needed)
1612 bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
1613 nr_cpumask_bits);
1614
1615 return 0; 1615 return 0;
1616} 1616}
1617device_initcall(topology_update_init); 1617device_initcall(topology_update_init);