author		Michael Holzheu <holzheu@linux.vnet.ibm.com>	2014-03-06 12:47:21 -0500
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2015-08-04 08:06:53 -0400
commit		c29a7baf091fc6b2c9e40561030f8c62e6145a19 (patch)
tree		dfddc7a273858c32c9946857bfff2dc7779e64a9
parent		e8054b654bf5d4f549f4f24b708acce6d2718b1b (diff)
s390/numa: add emulation support
NUMA emulation (aka fake NUMA) distributes the available memory to
nodes without using real topology information about the physical
memory of the machine. Splitting the system memory into nodes
replicates the memory management structures for each node. In
particular, each node has its own "mm locks" and its own "kswapd"
task. For large systems, under certain conditions, this results in
improved system performance and/or latency based on reduced pressure
on the mm locks and the kswapd tasks.

NUMA emulation distributes CPUs to nodes while respecting the original
machine topology information. This is done by trying to avoid
separating CPUs which reside on the same book or even on the same MC.
Because the current Linux scheduler code requires a stable CPU-to-node
mapping, cores are pinned to nodes when the first CPU thread is set
online.

This patch is based on the initial implementation from Philipp
Hachtmann.

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
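[Editor's note] A quick illustration of the round-robin striping described
above, as a standalone sketch rather than part of the patch: assuming the
default stripe size of 0x10000000 (256 MB, the CONFIG_EMU_SIZE default
below), s390's 4 KB pages (PAGE_SHIFT = 12) and four emulated nodes, the
mapping computed by emu_pfn_to_nid() in mode_emu.c works out as follows.

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* s390 uses 4 KB pages */

	int main(void)
	{
		unsigned long emu_size = 0x10000000;	/* 256 MB stripe */
		unsigned long emu_nodes = 4;
		unsigned long pages_per_stripe = emu_size >> PAGE_SHIFT;
		unsigned long pfn;

		/* Same formula as emu_pfn_to_nid(): stripes are dealt out
		 * to nodes 0, 1, 2, 3, 0, 1, ... in order. */
		for (pfn = 0; pfn < 8 * pages_per_stripe; pfn += pages_per_stripe)
			printf("pfn %7lu -> node %lu\n",
			       pfn, (pfn / pages_per_stripe) % emu_nodes);
		return 0;
	}

The first 256 MB of memory therefore lands on node 0, the next 256 MB on
node 1, and the pattern wraps around after node 3.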
-rw-r--r--	arch/s390/Kconfig		 37
-rw-r--r--	arch/s390/include/asm/numa.h	  4
-rw-r--r--	arch/s390/numa/Makefile		  1
-rw-r--r--	arch/s390/numa/mode_emu.c	511
-rw-r--r--	arch/s390/numa/numa.c		  4
-rw-r--r--	arch/s390/numa/numa_mode.h	  1
-rw-r--r--	drivers/s390/char/sclp_cmd.c	 18
7 files changed, 569 insertions(+), 7 deletions(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 25510adb07d3..cb418dcc2d45 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -423,6 +423,43 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA nodes available on the target
 	  system. Increases memory reserved to accommodate various tables.
 
+menu "Select NUMA modes"
+	depends on NUMA
+
+config NUMA_EMU
+	bool "NUMA emulation"
+	default y
+	help
+	  NUMA emulation mode will split the available system memory into
+	  equal chunks which then are distributed over the configured number
+	  of nodes in a round-robin manner.
+
+	  The number of fake nodes is limited by the number of available memory
+	  chunks (i.e. memory size / chunk size) and the number of supported
+	  nodes in the kernel.
+
+	  The CPUs are assigned to the nodes in a way that partially respects
+	  the original machine topology (if supported by the machine).
+	  Fair distribution of the CPUs is not guaranteed.
+
+config EMU_SIZE
+	hex "NUMA emulation memory chunk size"
+	default 0x10000000
+	range 0x400000 0x100000000
+	depends on NUMA_EMU
+	help
+	  Select the default size by which the memory is chopped and then
+	  assigned to emulated NUMA nodes.
+
+	  This can be overridden by specifying
+
+	  emu_size=<n>
+
+	  on the kernel command line, where the suffixes K, M, G, and T are
+	  also supported.
+
+endmenu
+
 config SCHED_MC
 	def_bool n
 
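[Editor's note] Putting the options together, a hypothetical boot
configuration that selects emulation with eight nodes and a 512 MB stripe
would pass, on the kernel command line:

	numa=emu emu_nodes=8 emu_size=512M

numa=emu picks the mode by name (see the parse_numa() hunk in numa.c
below); emu_nodes= and emu_size= are handled by the early_param parsers
at the end of mode_emu.c.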
diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h
index ea4edbfba9f6..2a0efc63b9e5 100644
--- a/arch/s390/include/asm/numa.h
+++ b/arch/s390/include/asm/numa.h
@@ -26,6 +26,10 @@ extern int numa_debug_enabled;
 
 static inline void numa_setup(void) { }
 static inline void numa_update_cpu_topology(void) { }
+static inline int numa_pfn_to_nid(unsigned long pfn)
+{
+	return 0;
+}
 
 #endif /* CONFIG_NUMA */
 #endif /* _ASM_S390_NUMA_H */
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
index 31372293b62e..f94ecaffa71b 100644
--- a/arch/s390/numa/Makefile
+++ b/arch/s390/numa/Makefile
@@ -1,2 +1,3 @@
 obj-y			+= numa.o
 obj-y			+= toptree.o
+obj-$(CONFIG_NUMA_EMU)	+= mode_emu.o
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
new file mode 100644
index 000000000000..9d4e1e15a6f0
--- /dev/null
+++ b/arch/s390/numa/mode_emu.c
@@ -0,0 +1,511 @@
+/*
+ * NUMA support for s390
+ *
+ * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
+ * without using real topology information about the physical memory of the
+ * machine.
+ *
+ * It distributes the available CPUs to nodes while respecting the original
+ * machine topology information. This is done by trying to avoid separating
+ * CPUs which reside on the same book or even on the same MC.
+ *
+ * Because the current Linux scheduler code requires a stable CPU-to-node
+ * mapping, cores are pinned to nodes when the first CPU thread is set online.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#define KMSG_COMPONENT "numa_emu"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <linux/memory.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+#include "numa_mode.h"
+#include "toptree.h"
+
+/* Distances between the different system components */
+#define DIST_EMPTY	0
+#define DIST_CORE	1
+#define DIST_MC		2
+#define DIST_BOOK	3
+#define DIST_MAX	4
+
+/* Node distance reported to common code */
+#define EMU_NODE_DIST	10
+
+/* Node ID for free (not yet pinned) cores */
+#define NODE_ID_FREE	-1
+
+/* Different levels of toptree */
+enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};
+
+/* The two toptree IDs */
+enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
+
+/* Number of NUMA nodes */
+static int emu_nodes = 1;
+/* NUMA stripe size */
+static unsigned long emu_size;
+/* Pinned core to node mapping */
+static int cores_to_node_id[CONFIG_NR_CPUS];
+/* Total number of pinned cores */
+static int cores_total;
+/* Number of cores per node without extra cores */
+static int cores_per_node_target;
+/* Number of cores pinned to node */
+static int cores_per_node[MAX_NUMNODES];
+
+/*
+ * Pin a core to a node
+ */
+static void pin_core_to_node(int core_id, int node_id)
+{
+	if (cores_to_node_id[core_id] == NODE_ID_FREE) {
+		cores_per_node[node_id]++;
+		cores_to_node_id[core_id] = node_id;
+		cores_total++;
+	} else {
+		WARN_ON(cores_to_node_id[core_id] != node_id);
+	}
+}
+
+/*
+ * Number of pinned cores of a node
+ */
+static int cores_pinned(struct toptree *node)
+{
+	return cores_per_node[node->id];
+}
+
+/*
+ * ID of the node where the core is pinned (or NODE_ID_FREE)
+ */
+static int core_pinned_to_node_id(struct toptree *core)
+{
+	return cores_to_node_id[core->id];
+}
+
+/*
+ * Number of cores in the tree that are not yet pinned
+ */
+static int cores_free(struct toptree *tree)
+{
+	struct toptree *core;
+	int count = 0;
+
+	toptree_for_each(core, tree, CORE) {
+		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
+			count++;
+	}
+	return count;
+}
+
+/*
+ * Return node of core
+ */
+static struct toptree *core_node(struct toptree *core)
+{
+	return core->parent->parent->parent;
+}
+
+/*
+ * Return book of core
+ */
+static struct toptree *core_book(struct toptree *core)
+{
+	return core->parent->parent;
+}
+
+/*
+ * Return mc of core
+ */
+static struct toptree *core_mc(struct toptree *core)
+{
+	return core->parent;
+}
+
+/*
+ * Distance between two cores
+ */
+static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
+{
+	if (core_book(core1)->id != core_book(core2)->id)
+		return DIST_BOOK;
+	if (core_mc(core1)->id != core_mc(core2)->id)
+		return DIST_MC;
+	/* Same core or sibling on same MC */
+	return DIST_CORE;
+}
+
+/*
+ * Distance of a node to a core
+ */
+static int dist_node_to_core(struct toptree *node, struct toptree *core)
+{
+	struct toptree *core_node;
+	int dist_min = DIST_MAX;
+
+	toptree_for_each(core_node, node, CORE)
+		dist_min = min(dist_min, dist_core_to_core(core_node, core));
+	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
+}
+
+/*
+ * Unify will delete empty nodes, therefore recreate nodes.
+ */
+static void toptree_unify_tree(struct toptree *tree)
+{
+	int nid;
+
+	toptree_unify(tree);
+	for (nid = 0; nid < emu_nodes; nid++)
+		toptree_get_child(tree, nid);
+}
+
+/*
+ * Find the best/nearest node for a given core and ensure that no node
+ * gets more than "cores_per_node_target + extra" cores.
+ */
+static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
+				     int extra)
+{
+	struct toptree *node, *node_best = NULL;
+	int dist_cur, dist_best;
+
+	dist_best = DIST_MAX;
+	node_best = NULL;
+	toptree_for_each(node, numa, NODE) {
+		/* Already pinned cores must use their nodes */
+		if (core_pinned_to_node_id(core) == node->id) {
+			node_best = node;
+			break;
+		}
+		/* Skip nodes that already have enough cores */
+		if (cores_pinned(node) >= cores_per_node_target + extra)
+			continue;
+		dist_cur = dist_node_to_core(node, core);
+		if (dist_cur < dist_best) {
+			dist_best = dist_cur;
+			node_best = node;
+		}
+	}
+	return node_best;
+}
+
+/*
+ * Find the best node for each core with respect to "extra" core count
+ */
+static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
+				   int extra)
+{
+	struct toptree *node, *core, *tmp;
+
+	toptree_for_each_safe(core, tmp, phys, CORE) {
+		node = node_for_core(numa, core, extra);
+		if (!node)
+			return;
+		toptree_move(core, node);
+		pin_core_to_node(core->id, node->id);
+	}
+}
+
+/*
+ * Move structures of given level to specified NUMA node
+ */
+static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
+				    enum toptree_level level, bool perfect)
+{
+	struct toptree *cur, *tmp;
+	int cores_free;
+
+	toptree_for_each_safe(cur, tmp, phys, level) {
+		cores_free = cores_per_node_target - toptree_count(node, CORE);
+		if (perfect) {
+			if (cores_free == toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		} else {
+			if (cores_free >= toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		}
+	}
+}
+
+/*
+ * Move structures of a given level to NUMA nodes. If "perfect" is specified
+ * move only perfectly fitting structures. Otherwise move also smaller
+ * than needed structures.
+ */
+static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
+			       enum toptree_level level, bool perfect)
+{
+	struct toptree *node;
+
+	toptree_for_each(node, numa, NODE)
+		move_level_to_numa_node(node, phys, level, perfect);
+}
+
+/*
+ * For the first run try to move the big structures
+ */
+static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
+{
+	struct toptree *core;
+
+	/* Always try to move perfectly fitting structures first */
+	move_level_to_numa(numa, phys, BOOK, true);
+	move_level_to_numa(numa, phys, BOOK, false);
+	move_level_to_numa(numa, phys, MC, true);
+	move_level_to_numa(numa, phys, MC, false);
+	/* Now pin all the moved cores */
+	toptree_for_each(core, numa, CORE)
+		pin_core_to_node(core->id, core_node(core)->id);
+}
+
+/*
+ * Allocate new topology and create required nodes
+ */
+static struct toptree *toptree_new(int id, int nodes)
+{
+	struct toptree *tree;
+	int nid;
+
+	tree = toptree_alloc(TOPOLOGY, id);
+	if (!tree)
+		goto fail;
+	for (nid = 0; nid < nodes; nid++) {
+		if (!toptree_get_child(tree, nid))
+			goto fail;
+	}
+	return tree;
+fail:
+	panic("NUMA emulation could not allocate topology");
+}
+
+/*
+ * Move cores from physical topology into NUMA target topology
+ * and try to keep as much of the physical topology as possible.
+ */
+static struct toptree *toptree_to_numa(struct toptree *phys)
+{
+	static int first = 1;
+	struct toptree *numa;
+
+	cores_per_node_target = (cores_total + cores_free(phys)) / emu_nodes;
+	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
+	if (first) {
+		toptree_to_numa_first(numa, phys);
+		first = 0;
+	}
+	toptree_to_numa_single(numa, phys, 0);
+	toptree_to_numa_single(numa, phys, 1);
+	toptree_unify_tree(numa);
+
+	WARN_ON(cpumask_weight(&phys->mask));
+	return numa;
+}
+
+/*
+ * Create a toptree out of the physical topology that we got from the hypervisor
+ */
+static struct toptree *toptree_from_topology(void)
+{
+	struct toptree *phys, *node, *book, *mc, *core;
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	phys = toptree_new(TOPTREE_ID_PHYS, 1);
+
+	for_each_online_cpu(cpu) {
+		top = &per_cpu(cpu_topology, cpu);
+		node = toptree_get_child(phys, 0);
+		book = toptree_get_child(node, top->book_id);
+		mc = toptree_get_child(book, top->socket_id);
+		core = toptree_get_child(mc, top->core_id);
+		if (!book || !mc || !core)
+			panic("NUMA emulation could not allocate memory");
+		cpumask_set_cpu(cpu, &core->mask);
+		toptree_update_mask(mc);
+	}
+	return phys;
+}
+
+/*
+ * Add toptree core to topology and create correct CPU masks
+ */
+static void topology_add_core(struct toptree *core)
+{
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	for_each_cpu(cpu, &core->mask) {
+		top = &per_cpu(cpu_topology, cpu);
+		cpumask_copy(&top->thread_mask, &core->mask);
+		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
+		cpumask_copy(&top->book_mask, &core_book(core)->mask);
+		cpumask_set_cpu(cpu, node_to_cpumask_map[core_node(core)->id]);
+		top->node_id = core_node(core)->id;
+	}
+}
+
+/*
+ * Apply toptree to topology and create CPU masks
+ */
+static void toptree_to_topology(struct toptree *numa)
+{
+	struct toptree *core;
+	int i;
+
+	/* Clear all node masks */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		cpumask_clear(node_to_cpumask_map[i]);
+
+	/* Rebuild all masks */
+	toptree_for_each(core, numa, CORE)
+		topology_add_core(core);
+}
+
+/*
+ * Show the node to core mapping
+ */
+static void print_node_to_core_map(void)
+{
+	int nid, cid;
+
+	if (!numa_debug_enabled)
+		return;
+	printk(KERN_DEBUG "NUMA node to core mapping\n");
+	for (nid = 0; nid < emu_nodes; nid++) {
+		printk(KERN_DEBUG " node %3d: ", nid);
+		for (cid = 0; cid < ARRAY_SIZE(cores_to_node_id); cid++) {
+			if (cores_to_node_id[cid] == nid)
+				printk(KERN_CONT "%d ", cid);
+		}
+		printk(KERN_CONT "\n");
+	}
+}
+
+/*
+ * Transfer physical topology into a NUMA topology and modify CPU masks
+ * according to the NUMA topology.
+ *
+ * This function is called under the CPU hotplug lock.
+ */
+static void emu_update_cpu_topology(void)
+{
+	struct toptree *phys, *numa;
+
+	phys = toptree_from_topology();
+	numa = toptree_to_numa(phys);
+	toptree_free(phys);
+	toptree_to_topology(numa);
+	toptree_free(numa);
+	print_node_to_core_map();
+}
+
+/*
+ * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
+ * alignment (needed for memory hotplug).
+ */
+static unsigned long emu_setup_size_adjust(unsigned long size)
+{
+	size = size ? : CONFIG_EMU_SIZE;
+	size = roundup(size, memory_block_size_bytes());
+	return size;
+}
+
+/*
+ * If there is not enough memory for the specified nodes, reduce the node count.
+ */
+static int emu_setup_nodes_adjust(int nodes)
+{
+	int nodes_max;
+
+	nodes_max = memblock.memory.total_size / emu_size;
+	nodes_max = max(nodes_max, 1);
+	if (nodes_max >= nodes)
+		return nodes;
+	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
+	return nodes_max;
+}
+
+/*
+ * Early emu setup
+ */
+static void emu_setup(void)
+{
+	int i;
+
+	emu_size = emu_setup_size_adjust(emu_size);
+	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
+	for (i = 0; i < ARRAY_SIZE(cores_to_node_id); i++)
+		cores_to_node_id[i] = NODE_ID_FREE;
+	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
+		emu_nodes, emu_size >> 20);
+}
+
+/*
+ * Return node id for given page number
+ */
+static int emu_pfn_to_nid(unsigned long pfn)
+{
+	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
+}
+
+/*
+ * Return stripe size
+ */
+static unsigned long emu_align(void)
+{
+	return emu_size;
+}
+
+/*
+ * Return distance between two nodes
+ */
+static int emu_distance(int node1, int node2)
+{
+	return (node1 != node2) * EMU_NODE_DIST;
+}
+
+/*
+ * Define callbacks for generic s390 NUMA infrastructure
+ */
+const struct numa_mode numa_mode_emu = {
+	.name = "emu",
+	.setup = emu_setup,
+	.update_cpu_topology = emu_update_cpu_topology,
+	.__pfn_to_nid = emu_pfn_to_nid,
+	.align = emu_align,
+	.distance = emu_distance,
+};
+
+/*
+ * Kernel parameter: emu_nodes=<n>
+ */
+static int __init early_parse_emu_nodes(char *p)
+{
+	int count;
+
+	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
+		return 0;
+	if (count <= 0)
+		return 0;
+	emu_nodes = min(count, MAX_NUMNODES);
+	return 0;
+}
+early_param("emu_nodes", early_parse_emu_nodes);
+
+/*
+ * Kernel parameter: emu_size=[<n>[k|M|G|T]]
+ */
+static int __init early_parse_emu_size(char *p)
+{
+	emu_size = memparse(p, NULL);
+	return 0;
+}
+early_param("emu_size", early_parse_emu_size);
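[Editor's note] A worked example of the emu_setup_nodes_adjust() clamp
above, as a userspace sketch with made-up sizes rather than kernel code: a
machine with 3 GB of memory and the default 256 MB stripe can back at most
12 single-stripe nodes, so a request for 16 nodes is reduced.

	#include <stdio.h>

	/* Mirrors the logic of emu_setup_nodes_adjust(). */
	static int nodes_adjust(unsigned long mem_size, unsigned long stripe,
				int nodes)
	{
		int nodes_max = mem_size / stripe;

		if (nodes_max < 1)
			nodes_max = 1;
		return nodes_max >= nodes ? nodes : nodes_max;
	}

	int main(void)
	{
		printf("%d\n", nodes_adjust(3UL << 30, 0x10000000, 16)); /* 12 */
		printf("%d\n", nodes_adjust(3UL << 30, 0x10000000, 4));  /*  4 */
		return 0;
	}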
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
index 0416a3671e33..09b1d2355bd9 100644
--- a/arch/s390/numa/numa.c
+++ b/arch/s390/numa/numa.c
@@ -175,6 +175,10 @@ static int __init parse_numa(char *parm)
 {
 	if (strcmp(parm, numa_mode_plain.name) == 0)
 		mode = &numa_mode_plain;
+#ifdef CONFIG_NUMA_EMU
+	if (strcmp(parm, numa_mode_emu.name) == 0)
+		mode = &numa_mode_emu;
+#endif
 	return 0;
 }
 early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
index 775659848011..08953b0b1c7f 100644
--- a/arch/s390/numa/numa_mode.h
+++ b/arch/s390/numa/numa_mode.h
@@ -19,5 +19,6 @@ struct numa_mode {
 };
 
 extern const struct numa_mode numa_mode_plain;
+extern const struct numa_mode numa_mode_emu;
 
 #endif /* __S390_NUMA_MODE_H */
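[Editor's note] The numa_mode structure patched here acts as a small
vtable: generic code in numa.c dispatches through whichever mode
parse_numa() selected. A sketch of the assumed shape, with field names and
signatures taken from the numa_mode_emu initializer and the emu_*
functions in mode_emu.c above (the real definition in numa_mode.h is only
partially visible in this diff):

	struct numa_mode {
		const char *name;			/* cmdline: numa=<name> */
		void (*setup)(void);			/* early setup */
		void (*update_cpu_topology)(void);	/* rebuild CPU masks */
		int (*__pfn_to_nid)(unsigned long pfn);	/* memory striping */
		unsigned long (*align)(void);		/* stripe alignment */
		int (*distance)(int, int);		/* node distance */
	};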
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index e9485fbbb373..806239c2cf2f 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/sclp.h>
+#include <asm/numa.h>
 
 #include "sclp.h"
 
@@ -388,11 +389,11 @@ static struct notifier_block sclp_mem_nb = {
 };
 
 static void __init align_to_block_size(unsigned long long *start,
-				       unsigned long long *size)
+				       unsigned long long *size,
+				       unsigned long long alignment)
 {
-	unsigned long long start_align, size_align, alignment;
+	unsigned long long start_align, size_align;
 
-	alignment = memory_block_size_bytes();
 	start_align = roundup(*start, alignment);
 	size_align = rounddown(*start + *size, alignment) - start_align;
 
@@ -404,8 +405,8 @@ static void __init align_to_block_size(unsigned long long *start,
 
 static void __init add_memory_merged(u16 rn)
 {
+	unsigned long long start, size, addr, block_size;
 	static u16 first_rn, num;
-	unsigned long long start, size;
 
 	if (rn && first_rn && (first_rn + num == rn)) {
 		num++;
@@ -423,9 +424,12 @@ static void __init add_memory_merged(u16 rn)
 		goto skip_add;
 	if (memory_end_set && (start + size > memory_end))
 		size = memory_end - start;
-	align_to_block_size(&start, &size);
-	if (size)
-		add_memory(0, start, size);
+	block_size = memory_block_size_bytes();
+	align_to_block_size(&start, &size, block_size);
+	if (!size)
+		goto skip_add;
+	for (addr = start; addr < start + size; addr += block_size)
+		add_memory(numa_pfn_to_nid(PFN_DOWN(addr)), addr, block_size);
 skip_add:
 	first_rn = rn;
 	num = 1;