author		Michael Holzheu <holzheu@linux.vnet.ibm.com>	2014-03-06 12:47:21 -0500
committer	Martin Schwidefsky <schwidefsky@de.ibm.com>	2015-08-04 08:06:53 -0400
commit		c29a7baf091fc6b2c9e40561030f8c62e6145a19 (patch)
tree		dfddc7a273858c32c9946857bfff2dc7779e64a9
parent		e8054b654bf5d4f549f4f24b708acce6d2718b1b (diff)
s390/numa: add emulation support
NUMA emulation (aka fake NUMA) distributes the available memory to
nodes without using real topology information about the physical
memory of the machine. Splitting the system memory into nodes
replicates the memory management structures for each node. In
particular, each node has its own "mm locks" and its own "kswapd"
task. For large systems, under certain conditions, this results in
improved system performance and/or latency based on reduced pressure
on the mm locks and the kswapd tasks.

NUMA emulation distributes CPUs to nodes while respecting the original
machine topology information. This is done by trying to avoid
separating CPUs which reside on the same book or even on the same MC.
Because the current Linux scheduler code requires a stable CPU-to-node
mapping, cores are pinned to nodes when the first CPU thread is set
online.

This patch is based on the initial implementation from Philipp
Hachtmann.

Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
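[Editor's note] A quick illustration of the round-robin striping described
above, as a standalone sketch rather than part of the patch: assuming the
default stripe size of 0x10000000 (256 MB, the CONFIG_EMU_SIZE default
below), s390's 4 KB pages (PAGE_SHIFT = 12) and four emulated nodes, the
mapping computed by emu_pfn_to_nid() in mode_emu.c works out as follows.

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* s390 uses 4 KB pages */

	int main(void)
	{
		unsigned long emu_size = 0x10000000;	/* 256 MB stripe */
		unsigned long emu_nodes = 4;
		unsigned long pages_per_stripe = emu_size >> PAGE_SHIFT;
		unsigned long pfn;

		/* Same formula as emu_pfn_to_nid(): stripes are dealt out
		 * to nodes 0, 1, 2, 3, 0, 1, ... in order. */
		for (pfn = 0; pfn < 8 * pages_per_stripe; pfn += pages_per_stripe)
			printf("pfn %7lu -> node %lu\n",
			       pfn, (pfn / pages_per_stripe) % emu_nodes);
		return 0;
	}

The first 256 MB of memory therefore lands on node 0, the next 256 MB on
node 1, and the pattern wraps around after node 3.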
-rw-r--r--	arch/s390/Kconfig		 37
-rw-r--r--	arch/s390/include/asm/numa.h	  4
-rw-r--r--	arch/s390/numa/Makefile		  1
-rw-r--r--	arch/s390/numa/mode_emu.c	511
-rw-r--r--	arch/s390/numa/numa.c		  4
-rw-r--r--	arch/s390/numa/numa_mode.h	  1
-rw-r--r--	drivers/s390/char/sclp_cmd.c	 18
7 files changed, 569 insertions(+), 7 deletions(-)
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 25510adb07d3..cb418dcc2d45 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -423,6 +423,43 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA nodes available on the target
 	  system. Increases memory reserved to accommodate various tables.
 
+menu "Select NUMA modes"
+	depends on NUMA
+
+config NUMA_EMU
+	bool "NUMA emulation"
+	default y
+	help
+	  NUMA emulation mode will split the available system memory into
+	  equal chunks which then are distributed over the configured number
+	  of nodes in a round-robin manner.
+
+	  The number of fake nodes is limited by the number of available memory
+	  chunks (i.e. memory size / chunk size) and the number of supported
+	  nodes in the kernel.
+
+	  The CPUs are assigned to the nodes in a way that partially respects
+	  the original machine topology (if supported by the machine).
+	  Fair distribution of the CPUs is not guaranteed.
+
+config EMU_SIZE
+	hex "NUMA emulation memory chunk size"
+	default 0x10000000
+	range 0x400000 0x100000000
+	depends on NUMA_EMU
+	help
+	  Select the default size by which the memory is chopped and then
+	  assigned to emulated NUMA nodes.
+
+	  This can be overridden by specifying
+
+	  emu_size=<n>
+
+	  on the kernel command line, where the suffixes K, M, G, and T are
+	  also supported.
+
+endmenu
+
 config SCHED_MC
 	def_bool n
 
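[Editor's note] Putting the options together, a hypothetical boot
configuration that selects emulation with eight nodes and a 512 MB stripe
would pass, on the kernel command line:

	numa=emu emu_nodes=8 emu_size=512M

numa=emu picks the mode by name (see the parse_numa() hunk in numa.c
below); emu_nodes= and emu_size= are handled by the early_param parsers
at the end of mode_emu.c.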
diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h
index ea4edbfba9f6..2a0efc63b9e5 100644
--- a/arch/s390/include/asm/numa.h
+++ b/arch/s390/include/asm/numa.h
@@ -26,6 +26,10 @@ extern int numa_debug_enabled;
 
 static inline void numa_setup(void) { }
 static inline void numa_update_cpu_topology(void) { }
+static inline int numa_pfn_to_nid(unsigned long pfn)
+{
+	return 0;
+}
 
 #endif /* CONFIG_NUMA */
 #endif /* _ASM_S390_NUMA_H */
diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile
index 31372293b62e..f94ecaffa71b 100644
--- a/arch/s390/numa/Makefile
+++ b/arch/s390/numa/Makefile
@@ -1,2 +1,3 @@
 obj-y			+= numa.o
 obj-y			+= toptree.o
+obj-$(CONFIG_NUMA_EMU)	+= mode_emu.o
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
new file mode 100644
index 000000000000..9d4e1e15a6f0
--- /dev/null
+++ b/arch/s390/numa/mode_emu.c
@@ -0,0 +1,511 @@
+/*
+ * NUMA support for s390
+ *
+ * NUMA emulation (aka fake NUMA) distributes the available memory to nodes
+ * without using real topology information about the physical memory of the
+ * machine.
+ *
+ * It distributes the available CPUs to nodes while respecting the original
+ * machine topology information. This is done by trying to avoid separating
+ * CPUs which reside on the same book or even on the same MC.
+ *
+ * Because the current Linux scheduler code requires a stable CPU-to-node
+ * mapping, cores are pinned to nodes when the first CPU thread is set online.
+ *
+ * Copyright IBM Corp. 2015
+ */
+
+#define KMSG_COMPONENT "numa_emu"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
+#include <linux/node.h>
+#include <linux/memory.h>
+#include <asm/smp.h>
+#include <asm/topology.h>
+#include "numa_mode.h"
+#include "toptree.h"
+
+/* Distances between the different system components */
+#define DIST_EMPTY	0
+#define DIST_CORE	1
+#define DIST_MC		2
+#define DIST_BOOK	3
+#define DIST_MAX	4
+
+/* Node distance reported to common code */
+#define EMU_NODE_DIST	10
+
+/* Node ID for free (not yet pinned) cores */
+#define NODE_ID_FREE	-1
+
+/* Different levels of toptree */
+enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};
+
+/* The two toptree IDs */
+enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
+
+/* Number of NUMA nodes */
+static int emu_nodes = 1;
+/* NUMA stripe size */
+static unsigned long emu_size;
+/* Pinned core to node mapping */
+static int cores_to_node_id[CONFIG_NR_CPUS];
+/* Total number of pinned cores */
+static int cores_total;
+/* Number of cores per node without extra cores */
+static int cores_per_node_target;
+/* Number of cores pinned to node */
+static int cores_per_node[MAX_NUMNODES];
+
+/*
+ * Pin a core to a node
+ */
+static void pin_core_to_node(int core_id, int node_id)
+{
+	if (cores_to_node_id[core_id] == NODE_ID_FREE) {
+		cores_per_node[node_id]++;
+		cores_to_node_id[core_id] = node_id;
+		cores_total++;
+	} else {
+		WARN_ON(cores_to_node_id[core_id] != node_id);
+	}
+}
+
+/*
+ * Number of pinned cores of a node
+ */
+static int cores_pinned(struct toptree *node)
+{
+	return cores_per_node[node->id];
+}
+
+/*
+ * ID of the node where the core is pinned (or NODE_ID_FREE)
+ */
+static int core_pinned_to_node_id(struct toptree *core)
+{
+	return cores_to_node_id[core->id];
+}
+
+/*
+ * Number of cores in the tree that are not yet pinned
+ */
+static int cores_free(struct toptree *tree)
+{
+	struct toptree *core;
+	int count = 0;
+
+	toptree_for_each(core, tree, CORE) {
+		if (core_pinned_to_node_id(core) == NODE_ID_FREE)
+			count++;
+	}
+	return count;
+}
+
+/*
+ * Return node of core
+ */
+static struct toptree *core_node(struct toptree *core)
+{
+	return core->parent->parent->parent;
+}
+
+/*
+ * Return book of core
+ */
+static struct toptree *core_book(struct toptree *core)
+{
+	return core->parent->parent;
+}
+
+/*
+ * Return mc of core
+ */
+static struct toptree *core_mc(struct toptree *core)
+{
+	return core->parent;
+}
+
+/*
+ * Distance between two cores
+ */
+static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
+{
+	if (core_book(core1)->id != core_book(core2)->id)
+		return DIST_BOOK;
+	if (core_mc(core1)->id != core_mc(core2)->id)
+		return DIST_MC;
+	/* Same core or sibling on same MC */
+	return DIST_CORE;
+}
+
+/*
+ * Distance of a node to a core
+ */
+static int dist_node_to_core(struct toptree *node, struct toptree *core)
+{
+	struct toptree *core_node;
+	int dist_min = DIST_MAX;
+
+	toptree_for_each(core_node, node, CORE)
+		dist_min = min(dist_min, dist_core_to_core(core_node, core));
+	return dist_min == DIST_MAX ? DIST_EMPTY : dist_min;
+}
+
+/*
+ * Unify will delete empty nodes, therefore recreate nodes.
+ */
+static void toptree_unify_tree(struct toptree *tree)
+{
+	int nid;
+
+	toptree_unify(tree);
+	for (nid = 0; nid < emu_nodes; nid++)
+		toptree_get_child(tree, nid);
+}
+
+/*
+ * Find the best/nearest node for a given core and ensure that no node
+ * gets more than "cores_per_node_target + extra" cores.
+ */
+static struct toptree *node_for_core(struct toptree *numa, struct toptree *core,
+				     int extra)
+{
+	struct toptree *node, *node_best = NULL;
+	int dist_cur, dist_best;
+
+	dist_best = DIST_MAX;
+	node_best = NULL;
+	toptree_for_each(node, numa, NODE) {
+		/* Already pinned cores must use their nodes */
+		if (core_pinned_to_node_id(core) == node->id) {
+			node_best = node;
+			break;
+		}
+		/* Skip nodes that already have enough cores */
+		if (cores_pinned(node) >= cores_per_node_target + extra)
+			continue;
+		dist_cur = dist_node_to_core(node, core);
+		if (dist_cur < dist_best) {
+			dist_best = dist_cur;
+			node_best = node;
+		}
+	}
+	return node_best;
+}
+
+/*
+ * Find the best node for each core with respect to "extra" core count
+ */
+static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys,
+				   int extra)
+{
+	struct toptree *node, *core, *tmp;
+
+	toptree_for_each_safe(core, tmp, phys, CORE) {
+		node = node_for_core(numa, core, extra);
+		if (!node)
+			return;
+		toptree_move(core, node);
+		pin_core_to_node(core->id, node->id);
+	}
+}
+
+/*
+ * Move structures of given level to specified NUMA node
+ */
+static void move_level_to_numa_node(struct toptree *node, struct toptree *phys,
+				    enum toptree_level level, bool perfect)
+{
+	struct toptree *cur, *tmp;
+	int cores_free;
+
+	toptree_for_each_safe(cur, tmp, phys, level) {
+		cores_free = cores_per_node_target - toptree_count(node, CORE);
+		if (perfect) {
+			if (cores_free == toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		} else {
+			if (cores_free >= toptree_count(cur, CORE))
+				toptree_move(cur, node);
+		}
+	}
+}
+
+/*
+ * Move structures of a given level to NUMA nodes. If "perfect" is specified
+ * move only perfectly fitting structures. Otherwise move also smaller
+ * than needed structures.
+ */
+static void move_level_to_numa(struct toptree *numa, struct toptree *phys,
+			       enum toptree_level level, bool perfect)
+{
+	struct toptree *node;
+
+	toptree_for_each(node, numa, NODE)
+		move_level_to_numa_node(node, phys, level, perfect);
+}
+
+/*
+ * For the first run try to move the big structures
+ */
+static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
+{
+	struct toptree *core;
+
+	/* Always try to move perfectly fitting structures first */
+	move_level_to_numa(numa, phys, BOOK, true);
+	move_level_to_numa(numa, phys, BOOK, false);
+	move_level_to_numa(numa, phys, MC, true);
+	move_level_to_numa(numa, phys, MC, false);
+	/* Now pin all the moved cores */
+	toptree_for_each(core, numa, CORE)
+		pin_core_to_node(core->id, core_node(core)->id);
+}
+
+/*
+ * Allocate new topology and create required nodes
+ */
+static struct toptree *toptree_new(int id, int nodes)
+{
+	struct toptree *tree;
+	int nid;
+
+	tree = toptree_alloc(TOPOLOGY, id);
+	if (!tree)
+		goto fail;
+	for (nid = 0; nid < nodes; nid++) {
+		if (!toptree_get_child(tree, nid))
+			goto fail;
+	}
+	return tree;
+fail:
+	panic("NUMA emulation could not allocate topology");
+}
+
+/*
+ * Move cores from physical topology into NUMA target topology
+ * and try to keep as much of the physical topology as possible.
+ */
+static struct toptree *toptree_to_numa(struct toptree *phys)
+{
+	static int first = 1;
+	struct toptree *numa;
+
+	cores_per_node_target = (cores_total + cores_free(phys)) / emu_nodes;
+	numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes);
+	if (first) {
+		toptree_to_numa_first(numa, phys);
+		first = 0;
+	}
+	toptree_to_numa_single(numa, phys, 0);
+	toptree_to_numa_single(numa, phys, 1);
+	toptree_unify_tree(numa);
+
+	WARN_ON(cpumask_weight(&phys->mask));
+	return numa;
+}
+
+/*
+ * Create a toptree out of the physical topology that we got from the hypervisor
+ */
+static struct toptree *toptree_from_topology(void)
+{
+	struct toptree *phys, *node, *book, *mc, *core;
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	phys = toptree_new(TOPTREE_ID_PHYS, 1);
+
+	for_each_online_cpu(cpu) {
+		top = &per_cpu(cpu_topology, cpu);
+		node = toptree_get_child(phys, 0);
+		book = toptree_get_child(node, top->book_id);
+		mc = toptree_get_child(book, top->socket_id);
+		core = toptree_get_child(mc, top->core_id);
+		if (!book || !mc || !core)
+			panic("NUMA emulation could not allocate memory");
+		cpumask_set_cpu(cpu, &core->mask);
+		toptree_update_mask(mc);
+	}
+	return phys;
+}
+
+/*
+ * Add toptree core to topology and create correct CPU masks
+ */
+static void topology_add_core(struct toptree *core)
+{
+	struct cpu_topology_s390 *top;
+	int cpu;
+
+	for_each_cpu(cpu, &core->mask) {
+		top = &per_cpu(cpu_topology, cpu);
+		cpumask_copy(&top->thread_mask, &core->mask);
+		cpumask_copy(&top->core_mask, &core_mc(core)->mask);
+		cpumask_copy(&top->book_mask, &core_book(core)->mask);
+		cpumask_set_cpu(cpu, node_to_cpumask_map[core_node(core)->id]);
+		top->node_id = core_node(core)->id;
+	}
+}
+
+/*
+ * Apply toptree to topology and create CPU masks
+ */
+static void toptree_to_topology(struct toptree *numa)
+{
+	struct toptree *core;
+	int i;
+
+	/* Clear all node masks */
+	for (i = 0; i < MAX_NUMNODES; i++)
+		cpumask_clear(node_to_cpumask_map[i]);
+
+	/* Rebuild all masks */
+	toptree_for_each(core, numa, CORE)
+		topology_add_core(core);
+}
+
+/*
+ * Show the node to core mapping
+ */
+static void print_node_to_core_map(void)
+{
+	int nid, cid;
+
+	if (!numa_debug_enabled)
+		return;
+	printk(KERN_DEBUG "NUMA node to core mapping\n");
+	for (nid = 0; nid < emu_nodes; nid++) {
+		printk(KERN_DEBUG " node %3d: ", nid);
+		for (cid = 0; cid < ARRAY_SIZE(cores_to_node_id); cid++) {
+			if (cores_to_node_id[cid] == nid)
+				printk(KERN_CONT "%d ", cid);
+		}
+		printk(KERN_CONT "\n");
+	}
+}
+
+/*
+ * Transfer physical topology into a NUMA topology and modify CPU masks
+ * according to the NUMA topology.
+ *
+ * This function is called under the CPU hotplug lock.
+ */
+static void emu_update_cpu_topology(void)
+{
+	struct toptree *phys, *numa;
+
+	phys = toptree_from_topology();
+	numa = toptree_to_numa(phys);
+	toptree_free(phys);
+	toptree_to_topology(numa);
+	toptree_free(numa);
+	print_node_to_core_map();
+}
+
+/*
+ * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum
+ * alignment (needed for memory hotplug).
+ */
+static unsigned long emu_setup_size_adjust(unsigned long size)
+{
+	size = size ? : CONFIG_EMU_SIZE;
+	size = roundup(size, memory_block_size_bytes());
+	return size;
+}
+
+/*
+ * If there is not enough memory for the specified nodes, reduce the node count.
+ */
+static int emu_setup_nodes_adjust(int nodes)
+{
+	int nodes_max;
+
+	nodes_max = memblock.memory.total_size / emu_size;
+	nodes_max = max(nodes_max, 1);
+	if (nodes_max >= nodes)
+		return nodes;
+	pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes);
+	return nodes_max;
+}
+
+/*
+ * Early emu setup
+ */
+static void emu_setup(void)
+{
+	int i;
+
+	emu_size = emu_setup_size_adjust(emu_size);
+	emu_nodes = emu_setup_nodes_adjust(emu_nodes);
+	for (i = 0; i < ARRAY_SIZE(cores_to_node_id); i++)
+		cores_to_node_id[i] = NODE_ID_FREE;
+	pr_info("Creating %d nodes with memory stripe size %ld MB\n",
+		emu_nodes, emu_size >> 20);
+}
+
+/*
+ * Return node id for given page number
+ */
+static int emu_pfn_to_nid(unsigned long pfn)
+{
+	return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes;
+}
+
+/*
+ * Return stripe size
+ */
+static unsigned long emu_align(void)
+{
+	return emu_size;
+}
+
+/*
+ * Return distance between two nodes
+ */
+static int emu_distance(int node1, int node2)
+{
+	return (node1 != node2) * EMU_NODE_DIST;
+}
+
+/*
+ * Define callbacks for generic s390 NUMA infrastructure
+ */
+const struct numa_mode numa_mode_emu = {
+	.name = "emu",
+	.setup = emu_setup,
+	.update_cpu_topology = emu_update_cpu_topology,
+	.__pfn_to_nid = emu_pfn_to_nid,
+	.align = emu_align,
+	.distance = emu_distance,
+};
+
+/*
+ * Kernel parameter: emu_nodes=<n>
+ */
+static int __init early_parse_emu_nodes(char *p)
+{
+	int count;
+
+	if (kstrtoint(p, 0, &count) != 0 || count <= 0)
+		return 0;
+	if (count <= 0)
+		return 0;
+	emu_nodes = min(count, MAX_NUMNODES);
+	return 0;
+}
+early_param("emu_nodes", early_parse_emu_nodes);
+
+/*
+ * Kernel parameter: emu_size=[<n>[k|M|G|T]]
+ */
+static int __init early_parse_emu_size(char *p)
+{
+	emu_size = memparse(p, NULL);
+	return 0;
+}
+early_param("emu_size", early_parse_emu_size);
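[Editor's note] A worked example of the emu_setup_nodes_adjust() clamp
above, as a userspace sketch with made-up sizes rather than kernel code: a
machine with 3 GB of memory and the default 256 MB stripe can back at most
12 single-stripe nodes, so a request for 16 nodes is reduced.

	#include <stdio.h>

	/* Mirrors the logic of emu_setup_nodes_adjust(). */
	static int nodes_adjust(unsigned long mem_size, unsigned long stripe,
				int nodes)
	{
		int nodes_max = mem_size / stripe;

		if (nodes_max < 1)
			nodes_max = 1;
		return nodes_max >= nodes ? nodes : nodes_max;
	}

	int main(void)
	{
		printf("%d\n", nodes_adjust(3UL << 30, 0x10000000, 16)); /* 12 */
		printf("%d\n", nodes_adjust(3UL << 30, 0x10000000, 4));  /*  4 */
		return 0;
	}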
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
index 0416a3671e33..09b1d2355bd9 100644
--- a/arch/s390/numa/numa.c
+++ b/arch/s390/numa/numa.c
@@ -175,6 +175,10 @@ static int __init parse_numa(char *parm)
 {
 	if (strcmp(parm, numa_mode_plain.name) == 0)
 		mode = &numa_mode_plain;
+#ifdef CONFIG_NUMA_EMU
+	if (strcmp(parm, numa_mode_emu.name) == 0)
+		mode = &numa_mode_emu;
+#endif
 	return 0;
 }
 early_param("numa", parse_numa);
diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h
index 775659848011..08953b0b1c7f 100644
--- a/arch/s390/numa/numa_mode.h
+++ b/arch/s390/numa/numa_mode.h
@@ -19,5 +19,6 @@ struct numa_mode {
 };
 
 extern const struct numa_mode numa_mode_plain;
+extern const struct numa_mode numa_mode_emu;
 
 #endif /* __S390_NUMA_MODE_H */
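[Editor's note] The numa_mode structure patched here acts as a small
vtable: generic code in numa.c dispatches through whichever mode
parse_numa() selected. A sketch of the assumed shape, with field names and
signatures taken from the numa_mode_emu initializer and the emu_*
functions in mode_emu.c above (the real definition in numa_mode.h is only
partially visible in this diff):

	struct numa_mode {
		const char *name;			/* cmdline: numa=<name> */
		void (*setup)(void);			/* early setup */
		void (*update_cpu_topology)(void);	/* rebuild CPU masks */
		int (*__pfn_to_nid)(unsigned long pfn);	/* memory striping */
		unsigned long (*align)(void);		/* stripe alignment */
		int (*distance)(int, int);		/* node distance */
	};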
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index e9485fbbb373..806239c2cf2f 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -25,6 +25,7 @@
 #include <asm/setup.h>
 #include <asm/page.h>
 #include <asm/sclp.h>
+#include <asm/numa.h>
 
 #include "sclp.h"
 
@@ -388,11 +389,11 @@ static struct notifier_block sclp_mem_nb = {
 };
 
 static void __init align_to_block_size(unsigned long long *start,
-				       unsigned long long *size)
+				       unsigned long long *size,
+				       unsigned long long alignment)
 {
-	unsigned long long start_align, size_align, alignment;
+	unsigned long long start_align, size_align;
 
-	alignment = memory_block_size_bytes();
 	start_align = roundup(*start, alignment);
 	size_align = rounddown(*start + *size, alignment) - start_align;
 
@@ -404,8 +405,8 @@ static void __init align_to_block_size(unsigned long long *start,
 
 static void __init add_memory_merged(u16 rn)
 {
+	unsigned long long start, size, addr, block_size;
 	static u16 first_rn, num;
-	unsigned long long start, size;
 
 	if (rn && first_rn && (first_rn + num == rn)) {
 		num++;
@@ -423,9 +424,12 @@ static void __init add_memory_merged(u16 rn)
 		goto skip_add;
 	if (memory_end_set && (start + size > memory_end))
 		size = memory_end - start;
-	align_to_block_size(&start, &size);
-	if (size)
-		add_memory(0, start, size);
+	block_size = memory_block_size_bytes();
+	align_to_block_size(&start, &size, block_size);
+	if (!size)
+		goto skip_add;
+	for (addr = start; addr < start + size; addr += block_size)
+		add_memory(numa_pfn_to_nid(PFN_DOWN(addr)), addr, block_size);
 skip_add:
 	first_rn = rn;
 	num = 1;