sparc64: fix and optimize irq distribution

irq_choose_cpu() should compare the affinity mask against cpu_online_map
rather than CPU_MASK_ALL, since irq_select_affinity() sets the interrupt's
affinity mask to cpu_online_map "and" CPU_MASK_ALL (which ends up being
just cpu_online_map).  The mask comparison in irq_choose_cpu() will always
fail since the two masks are not the same.  So the CPU chosen is the first
CPU in the intersection of cpu_online_map and CPU_MASK_ALL, which is always
CPU0.  That means all interrupts are reassigned to CPU0...

Distributing interrupts to CPUs in a linearly increasing round robin fashion
is not optimal for the UltraSPARC T1/T2.  Also, the irq_rover in
irq_choose_cpu() causes an interrupt to be assigned to a different
processor each time the interrupt is allocated and released.  This may lead
to an unbalanced distribution over time.

A static mapping of interrupts to processors is done to optimize and balance
interrupt distribution.  For the T1/T2, interrupts are spread to different
cores first, and then to strands within a core.

The following are some benchmarks showing the effects of interrupt
distribution on a T2.  The test was done with iperf using a pair of T5220
boxes, each with a 10GBe NIU (XAUI) connected back to back.

  TCP     | Stock       Linear RR IRQ  Optimized IRQ
  Streams | 2.6.30-rc5  Distribution   Distribution
          | GBits/sec   GBits/sec      GBits/sec
  --------+-----------------------------------------
    1       0.839       0.862          0.868
    8       1.16        4.96           5.88
   16       1.15        6.40           8.04
  100       1.09        7.28           8.68

Signed-off-by: Hong H. Pham <hong.pham@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
author: Hong H. Pham <hong.pham@windriver.com> 2009-06-04 05:10:11 -0400
committer: David S. Miller <davem@davemloft.net> 2009-06-16 07:56:28 -0400
commit: 280ff97494e0fef4124bee5c52e39b23a18dd283 (patch)
tree: e906ca3c5e0a6238882d181ab5b01fb3f40ba5df /arch/sparc
parent: 4fd78a5f1edf62ab1ca3d23efee4a8a336edb2b6 (diff)
5 files changed, 456 insertions, 25 deletions
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 54742e58831c..47029c66b17a 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_SPARC64)   += sstate.o
 obj-$(CONFIG_SPARC64)   += mdesc.o
 obj-$(CONFIG_SPARC64)   += pcr.o
 obj-$(CONFIG_SPARC64)   += nmi.o
+obj-$(CONFIG_SPARC64_SMP) += cpumap.o
 # sparc32 do not use GENERIC_HARDIRQS but uses the generic devres implementation
 obj-$(CONFIG_SPARC32)     += devres.o
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
new file mode 100644
index 000000000000..7430ed080b23
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.c
@@ -0,0 +1,431 @@
+/* cpumap.c: used for optimizing CPU assignment
+ *
+ * Copyright (C) 2009 Hong H. Pham <hong.pham@windriver.com>
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <asm/cpudata.h>
+#include "cpumap.h"
+enum { /* Levels of the cpuinfo tree, listed from the root downwards. */
+        CPUINFO_LVL_ROOT = 0, /* single root node; cpuinfo_id() reports id 0 */
+        CPUINFO_LVL_NODE, /* id taken from cpu_to_node(cpu) */
+        CPUINFO_LVL_CORE, /* id taken from cpu_data(cpu).core_id */
+        CPUINFO_LVL_PROC, /* id taken from cpu_data(cpu).proc_id */
+        CPUINFO_LVL_MAX, /* number of levels; sizes the per-level arrays */
+};
+enum { /* Bit flags stored in the *_iterate_method tables, one entry per level. */
+        ROVER_NO_OP              = 0, /* no flag set: the rover is left alone */
+        /* Increment rover every time level is visited */
+        ROVER_INC_ON_VISIT       = 1 << 0,
+        /* Increment parent's rover every time rover wraps around */
+        ROVER_INC_PARENT_ON_LOOP = 1 << 1,
+};
+struct cpuinfo_node { /* One node of the cpuinfo tree (see build_cpuinfo_tree()). */
+        int id; /* value returned by cpuinfo_id() for this node's level */
+        int level; /* one of CPUINFO_LVL_* */
+        int num_cpus;    /* Number of CPUs in this hierarchy */
+        int parent_index; /* index of the parent in nodes[]; -1 for the root */
+        int child_start; /* Array index of the first child node */
+        int child_end;   /* Array index of the last child node */
+        int rover;       /* Child node iterator */
+}; /* For CPUINFO_LVL_PROC nodes, child_start/child_end/rover are set to CPU
+    * numbers rather than nodes[] indices.
+    */
+struct cpuinfo_level { /* Where one level's nodes live inside cpuinfo_tree.nodes[];
+                        * filled in by enumerate_cpuinfo_nodes().
+                        */
+        int start_index; /* Index of first node of a level in a cpuinfo tree */
+        int end_index;   /* Index of last node of a level in a cpuinfo tree */
+        int num_nodes;   /* Number of nodes in a level in a cpuinfo tree */
+};
+/* The CPU hierarchy, held in one allocation: this header followed by
+ * total_nodes entries of nodes[].  Nodes are stored level by level (root at
+ * index 0, then the NODE, CORE and PROC levels); level[] records where each
+ * level starts and ends.
+ */
+struct cpuinfo_tree {
+        int total_nodes; /* number of entries allocated in nodes[] */
+        /* Offsets into nodes[] for each level of the tree */
+        struct cpuinfo_level level[CPUINFO_LVL_MAX];
+        /* C99 flexible array member (instead of the GNU zero-length array
+         * extension); sizeof(struct cpuinfo_tree) still excludes it, so the
+         * allocation arithmetic in build_cpuinfo_tree() is unchanged.
+         */
+        struct cpuinfo_node  nodes[];
+};
+static struct cpuinfo_tree *cpuinfo_tree; /* current tree; NULL until built or when building failed */
+static u16 cpu_distribution_map[NR_CPUS]; /* slot -> CPU, filled by _cpu_map_rebuild() from iterate_cpu() */
+static DEFINE_SPINLOCK(cpu_map_lock); /* taken by map_to_cpu() and cpu_map_rebuild() */
+/* Niagara optimized cpuinfo tree traversal.  Indexed by CPUINFO_LVL_*; each
+ * entry is a mask of ROVER_* flags.  iterate_cpu() selects this table when
+ * sun4v_chip_type is SUN4V_CHIP_NIAGARA1 or SUN4V_CHIP_NIAGARA2.
+ */
+static const int niagara_iterate_method[] = {
+        [CPUINFO_LVL_ROOT] = ROVER_NO_OP,
+        /* Strands (or virtual CPUs) within a core may not run concurrently
+         * on the Niagara, as instruction pipeline(s) are shared.  Distribute
+         * work to strands in different cores first for better concurrency.
+         * Go to next NUMA node when all cores are used.
+         */
+        [CPUINFO_LVL_NODE] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
+        /* Strands are grouped together by proc_id in cpuinfo_sparc, i.e.
+         * a proc_id represents an instruction pipeline.  Distribute work to
+         * strands in different proc_id groups if the core has multiple
+         * instruction pipelines (e.g. the Niagara 2/2+ has two).
+         */
+        [CPUINFO_LVL_CORE] = ROVER_INC_ON_VISIT,
+        /* Pick the next strand in the proc_id group. */
+        [CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT,
+};
+/* Generic cpuinfo tree traversal.  Distribute work round robin across NUMA
+ * nodes.
+ *
+ * Indexed by CPUINFO_LVL_*; each entry is a mask of ROVER_* flags.
+ * iterate_cpu() uses this table for every chip type other than
+ * SUN4V_CHIP_NIAGARA1 and SUN4V_CHIP_NIAGARA2.
+ */
+static const int generic_iterate_method[] = {
+        [CPUINFO_LVL_ROOT] = ROVER_INC_ON_VISIT,
+        [CPUINFO_LVL_NODE] = ROVER_NO_OP,
+        [CPUINFO_LVL_CORE] = ROVER_INC_PARENT_ON_LOOP,
+        [CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
+};
+/* Return the identifier of @cpu at tree level @level: 0 for the root,
+ * cpu_to_node() for the node level, and the core_id / proc_id fields of
+ * cpu_data() for the core and proc levels.  Any other @level yields
+ * -EINVAL.
+ */
+static int cpuinfo_id(int cpu, int level)
+{
+        if (level == CPUINFO_LVL_ROOT)
+                return 0;
+        if (level == CPUINFO_LVL_NODE)
+                return cpu_to_node(cpu);
+        if (level == CPUINFO_LVL_CORE)
+                return cpu_data(cpu).core_id;
+        if (level == CPUINFO_LVL_PROC)
+                return cpu_data(cpu).proc_id;
+        return -EINVAL;
+}
+/*
+ * Scan the online CPUs and work out, for every level of the cpuinfo tree,
+ * how many nodes it needs and which slice of the node array it occupies.
+ * A new node is counted at a level each time a CPU reports an id larger
+ * than the largest one seen so far at that level.
+ *
+ * @tree_level: array of CPUINFO_LVL_MAX entries, fully overwritten.
+ *
+ * Returns the total number of nodes (root included) the tree requires.
+ */
+static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
+{
+        int highest_id[CPUINFO_LVL_MAX];
+        int cpu, lvl, id, next_slot, total;
+        for (lvl = CPUINFO_LVL_ROOT; lvl < CPUINFO_LVL_MAX; lvl++) {
+                highest_id[lvl] = -1;
+                tree_level[lvl].start_index = 0;
+                tree_level[lvl].end_index = 0;
+                tree_level[lvl].num_nodes = 0;
+        }
+        /* The root node is always present. */
+        total = 1;
+        for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+                if (!cpu_online(cpu))
+                        continue;
+                for (lvl = CPUINFO_LVL_NODE; lvl <= CPUINFO_LVL_PROC; lvl++) {
+                        id = cpuinfo_id(cpu, lvl);
+                        if (id <= highest_id[lvl])
+                                continue;
+                        tree_level[lvl].num_nodes++;
+                        highest_id[lvl] = id;
+                        total++;
+                }
+        }
+        tree_level[CPUINFO_LVL_ROOT].num_nodes = 1;
+        /* Levels are laid out back to back, right after the root at slot 0. */
+        next_slot = 1;
+        for (lvl = CPUINFO_LVL_NODE; lvl <= CPUINFO_LVL_PROC; lvl++) {
+                tree_level[lvl].start_index = next_slot;
+                next_slot += tree_level[lvl].num_nodes;
+                tree_level[lvl].end_index = next_slot - 1;
+        }
+        return total;
+}
+/* Build a tree representation of the CPU hierarchy using the per CPU
+ * information in __cpu_data.  Entries in __cpu_data[0..NR_CPUS] are
+ * assumed to be sorted in ascending order based on node, core_id, and
+ * proc_id (in order of significance).
+ *
+ * Returns the newly allocated tree, or NULL when the allocation fails or
+ * cpuinfo_id() reports a negative id.  _cpu_map_rebuild() stores the result
+ * and later releases it with kfree().
+ *
+ * NOTE(review): kzalloc()/kfree() are normally declared by <linux/slab.h>,
+ * which is not among this file's includes - confirm it is pulled in
+ * indirectly.
+ * NOTE(review): the scan for last_cpu starts at num_possible_cpus() - 1,
+ * which presumably assumes CPU numbers are contiguous - confirm.
+ */
+static struct cpuinfo_tree *build_cpuinfo_tree(void)
+{
+        struct cpuinfo_tree *new_tree;
+        struct cpuinfo_node *node;
+        struct cpuinfo_level tmp_level[CPUINFO_LVL_MAX];
+        int num_cpus[CPUINFO_LVL_MAX]; /* CPUs counted so far in the open node of each level */
+        int level_rover[CPUINFO_LVL_MAX]; /* nodes[] index of the open node of each level */
+        int prev_id[CPUINFO_LVL_MAX]; /* id of the open node of each level */
+        int n, id, cpu, prev_cpu, last_cpu, level;
+        n = enumerate_cpuinfo_nodes(tmp_level);
+        new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
+                           (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
+        if (!new_tree)
+                return NULL;
+        new_tree->total_nodes = n;
+        memcpy(&new_tree->level, tmp_level, sizeof(tmp_level));
+        prev_cpu = cpu = first_cpu(cpu_online_map);
+        /* Initialize all levels in the tree with the first CPU */
+        for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT; level--) {
+                n = new_tree->level[level].start_index;
+                level_rover[level] = n;
+                node = &new_tree->nodes[n];
+                id = cpuinfo_id(cpu, level);
+                if (unlikely(id < 0)) {
+                        kfree(new_tree);
+                        return NULL;
+                }
+                node->id = id;
+                node->level = level;
+                node->num_cpus = 1;
+                node->parent_index = (level > CPUINFO_LVL_ROOT)
+                    ? new_tree->level[level - 1].start_index : -1;
+                node->child_start = node->child_end = node->rover =
+                    (level == CPUINFO_LVL_PROC)
+                    ? cpu : new_tree->level[level + 1].start_index; /* PROC nodes point at CPU numbers */
+                prev_id[level] = node->id;
+                num_cpus[level] = 1;
+        }
+        for (last_cpu = (num_possible_cpus() - 1); last_cpu >= 0; last_cpu--) { /* find the last online CPU */
+                if (cpu_online(last_cpu))
+                        break;
+        }
+        while (++cpu <= last_cpu) {
+                if (!cpu_online(cpu))
+                        continue;
+                for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT;
+                     level--) {
+                        id = cpuinfo_id(cpu, level);
+                        if (unlikely(id < 0)) {
+                                kfree(new_tree);
+                                return NULL;
+                        }
+                        if ((id != prev_id[level]) || (cpu == last_cpu)) { /* close the open node on an id change or at the final CPU */
+                                prev_id[level] = id;
+                                node = &new_tree->nodes[level_rover[level]];
+                                node->num_cpus = num_cpus[level];
+                                num_cpus[level] = 1;
+                                if (cpu == last_cpu) /* NOTE(review): the final CPU is counted in the node being closed even if its id differs - confirm intended */
+                                        node->num_cpus++;
+                                /* Connect tree node to parent */
+                                if (level == CPUINFO_LVL_ROOT)
+                                        node->parent_index = -1;
+                                else
+                                        node->parent_index =
+                                            level_rover[level - 1];
+                                if (level == CPUINFO_LVL_PROC) {
+                                        node->child_end =
+                                            (cpu == last_cpu) ? cpu : prev_cpu;
+                                } else {
+                                        node->child_end =
+                                            level_rover[level + 1] - 1; /* the child level was processed first and its rover already advanced */
+                                }
+                                /* Initialize the next node in the same level */
+                                n = ++level_rover[level];
+                                if (n <= new_tree->level[level].end_index) {
+                                        node = &new_tree->nodes[n];
+                                        node->id = id;
+                                        node->level = level;
+                                        /* Connect node to child */
+                                        node->child_start = node->child_end =
+                                        node->rover =
+                                            (level == CPUINFO_LVL_PROC)
+                                            ? cpu : level_rover[level + 1];
+                                }
+                        } else
+                                num_cpus[level]++;
+                }
+                prev_cpu = cpu;
+        }
+        return new_tree;
+}
+/* Advance the rover of the node at @node_index to its next child.  When the
+ * rover runs past child_end it is reset to child_start and, if the level's
+ * entry in @rover_inc_table has ROVER_INC_PARENT_ON_LOOP set, the same step
+ * is applied to the parent - but never above the level of the node at
+ * @root_index.
+ */
+static void increment_rover(struct cpuinfo_tree *t, int node_index,
+                            int root_index, const int *rover_inc_table)
+{
+        struct cpuinfo_node *cur = &t->nodes[node_index];
+        const int stop_level = t->nodes[root_index].level;
+        int lvl = cur->level;
+        while (lvl >= stop_level) {
+                if (++cur->rover <= cur->child_end)
+                        break;
+                /* Wrapped around: start over at the first child. */
+                cur->rover = cur->child_start;
+                if (lvl == stop_level)
+                        break;
+                if (!(rover_inc_table[lvl] & ROVER_INC_PARENT_ON_LOOP))
+                        break;
+                cur = &t->nodes[cur->parent_index];
+                lvl--;
+        }
+}
+/* Descend from the node at @root_index to the bottom of the tree, following
+ * each node's rover, and return the value reached after the last level (the
+ * rover of a CPUINFO_LVL_PROC node).  Rovers are advanced along the way for
+ * every level whose table entry has ROVER_INC_ON_VISIT set; the table is
+ * chosen from sun4v_chip_type.
+ */
+static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
+{
+        const int *method;
+        int picked, cur = root_index;
+        int depth = t->nodes[root_index].level;
+        if (sun4v_chip_type == SUN4V_CHIP_NIAGARA1 ||
+            sun4v_chip_type == SUN4V_CHIP_NIAGARA2)
+                method = niagara_iterate_method;
+        else
+                method = generic_iterate_method;
+        while (depth < CPUINFO_LVL_MAX) {
+                /* Read the rover before it is (possibly) advanced. */
+                picked = t->nodes[cur].rover;
+                if (method[depth] & ROVER_INC_ON_VISIT)
+                        increment_rover(t, cur, root_index, method);
+                cur = picked;
+                depth++;
+        }
+        return cur;
+}
+/* Throw away the current cpuinfo tree, build a fresh one and refill
+ * cpu_distribution_map[] from it.  If the tree cannot be built,
+ * cpuinfo_tree is left NULL and the map is not touched.
+ */
+static void _cpu_map_rebuild(void)
+{
+        int slot, count;
+        /* kfree(NULL) is a no-op, so no guard is needed. */
+        kfree(cpuinfo_tree);
+        cpuinfo_tree = build_cpuinfo_tree();
+        if (!cpuinfo_tree)
+                return;
+        /* One map slot per CPU counted in the root node.  Offline CPUs were
+         * already skipped while the tree was built, so no online check is
+         * needed here.
+         */
+        count = cpuinfo_tree->nodes[0].num_cpus;
+        for (slot = 0; slot < count; slot++)
+                cpu_distribution_map[slot] = iterate_cpu(cpuinfo_tree, 0);
+}
+/* Fallback if the cpuinfo tree could not be built.  CPU mapping is linear
+ * round robin: @index is reduced modulo the number of online CPUs and the
+ * online CPU at that ordinal position is returned.
+ *
+ * Only online CPUs are walked, and they are counted rather than compared by
+ * CPU number, so a hole in the online map neither stalls the scan nor skews
+ * the result.
+ */
+static int simple_map_to_cpu(unsigned int index)
+{
+        unsigned int target, seen = 0;
+        int cpu;
+        target = index % num_online_cpus();
+        for_each_online_cpu(cpu) {
+                if (seen++ == target)
+                        return cpu;
+        }
+        /* Only reachable if the online map shrank during the walk. */
+        return first_cpu(cpu_online_map);
+}
+/* Map @index onto a CPU using cpu_distribution_map[], (re)building the
+ * cpuinfo tree first when it does not exist or, with CONFIG_HOTPLUG_CPU,
+ * when its CPU count no longer matches num_online_cpus().  Falls back to
+ * simple_map_to_cpu() if the tree cannot be built.
+ *
+ * Called with cpu_map_lock held (see map_to_cpu()).
+ */
+static int _map_to_cpu(unsigned int index)
+{
+        if (unlikely(!cpuinfo_tree)) {
+                _cpu_map_rebuild();
+                if (!cpuinfo_tree)
+                        return simple_map_to_cpu(index);
+        }
+#ifdef CONFIG_HOTPLUG_CPU
+        if (unlikely(cpuinfo_tree->nodes[0].num_cpus != num_online_cpus())) {
+                _cpu_map_rebuild();
+                if (!cpuinfo_tree)
+                        return simple_map_to_cpu(index);
+        }
+#endif
+        /* _cpu_map_rebuild() frees and reallocates the tree, so the root is
+         * looked up only now; a pointer taken before the rebuild would
+         * reference freed memory.
+         */
+        return cpu_distribution_map[index % cpuinfo_tree->nodes[0].num_cpus];
+}
+/* Return the CPU that @index maps to.  The lookup runs under cpu_map_lock
+ * with interrupts disabled; with CONFIG_HOTPLUG_CPU it is repeated until
+ * the chosen CPU is online.
+ */
+int map_to_cpu(unsigned int index)
+{
+        unsigned long irq_flags;
+        int cpu;
+        spin_lock_irqsave(&cpu_map_lock, irq_flags);
+#ifdef CONFIG_HOTPLUG_CPU
+        do {
+                cpu = _map_to_cpu(index);
+        } while (unlikely(!cpu_online(cpu)));
+#else
+        cpu = _map_to_cpu(index);
+#endif
+        spin_unlock_irqrestore(&cpu_map_lock, irq_flags);
+        return cpu;
+}
+EXPORT_SYMBOL(map_to_cpu);
+/* Locked wrapper around _cpu_map_rebuild(): rebuilds the cpuinfo tree and
+ * the distribution map while holding cpu_map_lock with interrupts disabled.
+ */
+void cpu_map_rebuild(void)
+{
+        unsigned long irq_flags;
+        spin_lock_irqsave(&cpu_map_lock, irq_flags);
+        _cpu_map_rebuild();
+        spin_unlock_irqrestore(&cpu_map_lock, irq_flags);
+}
diff --git a/arch/sparc/kernel/cpumap.h b/arch/sparc/kernel/cpumap.h
new file mode 100644
index 000000000000..e639880ab864
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.h
@@ -0,0 +1,16 @@
+#ifndef _CPUMAP_H
+#define _CPUMAP_H
+#ifdef CONFIG_SMP
+extern void cpu_map_rebuild(void); /* rebuild the CPU distribution map */
+extern int  map_to_cpu(unsigned int index); /* map an index (e.g. a virtual IRQ number) to a CPU */
+#define cpu_map_init() cpu_map_rebuild() /* initialising is simply a first rebuild */
+#else /* !CONFIG_SMP: nothing to build, everything maps to the current CPU */
+#define cpu_map_init() do {} while (0)
+static inline int map_to_cpu(unsigned int index)
+{
+        return raw_smp_processor_id();
+}
+#endif /* CONFIG_SMP */
+#endif /* _CPUMAP_H */
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index e5e78f9cfc95..bd075054942b 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -45,6 +45,7 @@
 #include <asm/cacheflush.h>
 #include "entry.h"
+#include "cpumap.h"
 #define NUM_IVECS       (IMAP_INR + 1)
@@ -256,35 +257,13 @@ static int irq_choose_cpu(unsigned int virt_irq)
        int cpuid;
        cpumask_copy(&mask, irq_desc[virt_irq].affinity);
-        if (cpus_equal(mask, CPU_MASK_ALL)) {
+        if (cpus_equal(mask, cpu_online_map)) {
-                static int irq_rover;
+                cpuid = map_to_cpu(virt_irq);
-                static DEFINE_SPINLOCK(irq_rover_lock);
-                unsigned long flags;
-                /* Round-robin distribution... */
-        do_round_robin:
-                spin_lock_irqsave(&irq_rover_lock, flags);
-                while (!cpu_online(irq_rover)) {
-                        if (++irq_rover >= nr_cpu_ids)
-                                irq_rover = 0;
-                }
-                cpuid = irq_rover;
-                do {
-                        if (++irq_rover >= nr_cpu_ids)
-                                irq_rover = 0;
-                } while (!cpu_online(irq_rover));
-                spin_unlock_irqrestore(&irq_rover_lock, flags);
        } else {
                cpumask_t tmp;
                cpus_and(tmp, cpu_online_map, mask);
+                cpuid = cpus_empty(tmp) ? map_to_cpu(virt_irq) : first_cpu(tmp);
-                if (cpus_empty(tmp))
-                        goto do_round_robin;
-                cpuid = first_cpu(tmp);
        }
        return cpuid;
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 1de47d2169c8..cfb3d06058ff 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -48,6 +48,8 @@
 #include <asm/ldc.h>
 #include <asm/hypervisor.h>
+#include "cpumap.h"
 int sparc64_multi_core __read_mostly;
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
@@ -1314,6 +1316,8 @@ int __cpu_disable(void)
        cpu_clear(cpu, cpu_online_map);
        ipi_call_unlock();
+        cpu_map_rebuild();
        return 0;
 }
author	Hong H. Pham <hong.pham@windriver.com>	2009-06-04 05:10:11 -0400
committer	David S. Miller <davem@davemloft.net>	2009-06-16 07:56:28 -0400
commit	280ff97494e0fef4124bee5c52e39b23a18dd283 (patch)
tree	e906ca3c5e0a6238882d181ab5b01fb3f40ba5df /arch/sparc
parent	4fd78a5f1edf62ab1ca3d23efee4a8a336edb2b6 (diff)

diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 54742e58831c..47029c66b17a 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_SPARC64) += sstate.o
54	obj-$(CONFIG_SPARC64) += mdesc.o	54	obj-$(CONFIG_SPARC64) += mdesc.o
55	obj-$(CONFIG_SPARC64) += pcr.o	55	obj-$(CONFIG_SPARC64) += pcr.o
56	obj-$(CONFIG_SPARC64) += nmi.o	56	obj-$(CONFIG_SPARC64) += nmi.o
		57	obj-$(CONFIG_SPARC64_SMP) += cpumap.o
57		58
58	# sparc32 do not use GENERIC_HARDIRQS but uses the generic devres implementation	59	# sparc32 do not use GENERIC_HARDIRQS but uses the generic devres implementation
59	obj-$(CONFIG_SPARC32) += devres.o	60	obj-$(CONFIG_SPARC32) += devres.o


diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c new file mode 100644 index 000000000000..7430ed080b23 --- /dev/null +++ b/arch/sparc/kernel/cpumap.c
@@ -0,0 +1,431 @@
		1	/* cpumap.c: used for optimizing CPU assignment
		2	*
		3	* Copyright (C) 2009 Hong H. Pham <hong.pham@windriver.com>
		4	*/
		5
		6	#include <linux/module.h>
		7	#include <linux/kernel.h>
		8	#include <linux/init.h>
		9	#include <linux/cpumask.h>
		10	#include <linux/spinlock.h>
		11	#include <asm/cpudata.h>
		12	#include "cpumap.h"
		13
		14
		15	enum {
		16	CPUINFO_LVL_ROOT = 0,
		17	CPUINFO_LVL_NODE,
		18	CPUINFO_LVL_CORE,
		19	CPUINFO_LVL_PROC,
		20	CPUINFO_LVL_MAX,
		21	};
		22
		23	enum {
		24	ROVER_NO_OP = 0,
		25	/* Increment rover every time level is visited */
		26	ROVER_INC_ON_VISIT = 1 << 0,
		27	/* Increment parent's rover every time rover wraps around */
		28	ROVER_INC_PARENT_ON_LOOP = 1 << 1,
		29	};
		30
		31	struct cpuinfo_node {
		32	int id;
		33	int level;
		34	int num_cpus; /* Number of CPUs in this hierarchy */
		35	int parent_index;
		36	int child_start; /* Array index of the first child node */
		37	int child_end; /* Array index of the last child node */
		38	int rover; /* Child node iterator */
		39	};
		40
		41	struct cpuinfo_level {
		42	int start_index; /* Index of first node of a level in a cpuinfo tree */
		43	int end_index; /* Index of last node of a level in a cpuinfo tree */
		44	int num_nodes; /* Number of nodes in a level in a cpuinfo tree */
		45	};
		46
		47	struct cpuinfo_tree {
		48	int total_nodes;
		49
		50	/* Offsets into nodes[] for each level of the tree */
		51	struct cpuinfo_level level[CPUINFO_LVL_MAX];
		52	struct cpuinfo_node nodes[0];
		53	};
		54
		55
		56	static struct cpuinfo_tree *cpuinfo_tree;
		57
		58	static u16 cpu_distribution_map[NR_CPUS];
		59	static DEFINE_SPINLOCK(cpu_map_lock);
		60
		61
		62	/* Niagara optimized cpuinfo tree traversal. */
		63	static const int niagara_iterate_method[] = {
		64	[CPUINFO_LVL_ROOT] = ROVER_NO_OP,
		65
		66	/* Strands (or virtual CPUs) within a core may not run concurrently
		67	* on the Niagara, as instruction pipeline(s) are shared. Distribute
		68	* work to strands in different cores first for better concurrency.
		69	* Go to next NUMA node when all cores are used.
		70	*/
		71	[CPUINFO_LVL_NODE] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
		72
		73	/* Strands are grouped together by proc_id in cpuinfo_sparc, i.e.
		74	* a proc_id represents an instruction pipeline. Distribute work to
		75	* strands in different proc_id groups if the core has multiple
		76	* instruction pipelines (e.g. the Niagara 2/2+ has two).
		77	*/
		78	[CPUINFO_LVL_CORE] = ROVER_INC_ON_VISIT,
		79
		80	/* Pick the next strand in the proc_id group. */
		81	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT,
		82	};
		83
		84	/* Generic cpuinfo tree traversal. Distribute work round robin across NUMA
		85	* nodes.
		86	*/
		87	static const int generic_iterate_method[] = {
		88	[CPUINFO_LVL_ROOT] = ROVER_INC_ON_VISIT,
		89	[CPUINFO_LVL_NODE] = ROVER_NO_OP,
		90	[CPUINFO_LVL_CORE] = ROVER_INC_PARENT_ON_LOOP,
		91	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
		92	};
		93
		94
		95	static int cpuinfo_id(int cpu, int level)
		96	{
		97	int id;
		98
		99	switch (level) {
		100	case CPUINFO_LVL_ROOT:
		101	id = 0;
		102	break;
		103	case CPUINFO_LVL_NODE:
		104	id = cpu_to_node(cpu);
		105	break;
		106	case CPUINFO_LVL_CORE:
		107	id = cpu_data(cpu).core_id;
		108	break;
		109	case CPUINFO_LVL_PROC:
		110	id = cpu_data(cpu).proc_id;
		111	break;
		112	default:
		113	id = -EINVAL;
		114	}
		115	return id;
		116	}
		117
		118	/*
		119	* Enumerate the CPU information in __cpu_data to determine the start index,
		120	* end index, and number of nodes for each level in the cpuinfo tree. The
		121	* total number of cpuinfo nodes required to build the tree is returned.
		122	*/
		123	static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
		124	{
		125	int prev_id[CPUINFO_LVL_MAX];
		126	int i, n, num_nodes;
		127
		128	for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) {
		129	struct cpuinfo_level *lv = &tree_level[i];
		130
		131	prev_id[i] = -1;
		132	lv->start_index = lv->end_index = lv->num_nodes = 0;
		133	}
		134
		135	num_nodes = 1; /* Include the root node */
		136
		137	for (i = 0; i < num_possible_cpus(); i++) {
		138	if (!cpu_online(i))
		139	continue;
		140
		141	n = cpuinfo_id(i, CPUINFO_LVL_NODE);
		142	if (n > prev_id[CPUINFO_LVL_NODE]) {
		143	tree_level[CPUINFO_LVL_NODE].num_nodes++;
		144	prev_id[CPUINFO_LVL_NODE] = n;
		145	num_nodes++;
		146	}
		147	n = cpuinfo_id(i, CPUINFO_LVL_CORE);
		148	if (n > prev_id[CPUINFO_LVL_CORE]) {
		149	tree_level[CPUINFO_LVL_CORE].num_nodes++;
		150	prev_id[CPUINFO_LVL_CORE] = n;
		151	num_nodes++;
		152	}
		153	n = cpuinfo_id(i, CPUINFO_LVL_PROC);
		154	if (n > prev_id[CPUINFO_LVL_PROC]) {
		155	tree_level[CPUINFO_LVL_PROC].num_nodes++;
		156	prev_id[CPUINFO_LVL_PROC] = n;
		157	num_nodes++;
		158	}
		159	}
		160
		161	tree_level[CPUINFO_LVL_ROOT].num_nodes = 1;
		162
		163	n = tree_level[CPUINFO_LVL_NODE].num_nodes;
		164	tree_level[CPUINFO_LVL_NODE].start_index = 1;
		165	tree_level[CPUINFO_LVL_NODE].end_index = n;
		166
		167	n++;
		168	tree_level[CPUINFO_LVL_CORE].start_index = n;
		169	n += tree_level[CPUINFO_LVL_CORE].num_nodes;
		170	tree_level[CPUINFO_LVL_CORE].end_index = n - 1;
		171
		172	tree_level[CPUINFO_LVL_PROC].start_index = n;
		173	n += tree_level[CPUINFO_LVL_PROC].num_nodes;
		174	tree_level[CPUINFO_LVL_PROC].end_index = n - 1;
		175
		176	return num_nodes;
		177	}
		178
		179	/* Build a tree representation of the CPU hierarchy using the per CPU
		180	* information in __cpu_data. Entries in __cpu_data[0..NR_CPUS] are
		181	* assumed to be sorted in ascending order based on node, core_id, and
		182	* proc_id (in order of significance).
		183	*/
		184	static struct cpuinfo_tree *build_cpuinfo_tree(void)
		185	{
		186	struct cpuinfo_tree *new_tree;
		187	struct cpuinfo_node *node;
		188	struct cpuinfo_level tmp_level[CPUINFO_LVL_MAX];
		189	int num_cpus[CPUINFO_LVL_MAX];
		190	int level_rover[CPUINFO_LVL_MAX];
		191	int prev_id[CPUINFO_LVL_MAX];
		192	int n, id, cpu, prev_cpu, last_cpu, level;
		193
		194	n = enumerate_cpuinfo_nodes(tmp_level);
		195
		196	new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
		197	(sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
		198	if (!new_tree)
		199	return NULL;
		200
		201	new_tree->total_nodes = n;
		202	memcpy(&new_tree->level, tmp_level, sizeof(tmp_level));
		203
		204	prev_cpu = cpu = first_cpu(cpu_online_map);
		205
		206	/* Initialize all levels in the tree with the first CPU */
		207	for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT; level--) {
		208	n = new_tree->level[level].start_index;
		209
		210	level_rover[level] = n;
		211	node = &new_tree->nodes[n];
		212
		213	id = cpuinfo_id(cpu, level);
		214	if (unlikely(id < 0)) {
		215	kfree(new_tree);
		216	return NULL;
		217	}
		218	node->id = id;
		219	node->level = level;
		220	node->num_cpus = 1;
		221
		222	node->parent_index = (level > CPUINFO_LVL_ROOT)
		223	? new_tree->level[level - 1].start_index : -1;
		224
		225	node->child_start = node->child_end = node->rover =
		226	(level == CPUINFO_LVL_PROC)
		227	? cpu : new_tree->level[level + 1].start_index;
		228
		229	prev_id[level] = node->id;
		230	num_cpus[level] = 1;
		231	}
		232
		233	for (last_cpu = (num_possible_cpus() - 1); last_cpu >= 0; last_cpu--) {
		234	if (cpu_online(last_cpu))
		235	break;
		236	}
		237
		238	while (++cpu <= last_cpu) {
		239	if (!cpu_online(cpu))
		240	continue;
		241
		242	for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT;
		243	level--) {
		244	id = cpuinfo_id(cpu, level);
		245	if (unlikely(id < 0)) {
		246	kfree(new_tree);
		247	return NULL;
		248	}
		249
		250	if ((id != prev_id[level]) || (cpu == last_cpu)) {
		251	prev_id[level] = id;
		252	node = &new_tree->nodes[level_rover[level]];
		253	node->num_cpus = num_cpus[level];
		254	num_cpus[level] = 1;
		255
		256	if (cpu == last_cpu)
		257	node->num_cpus++;
		258
		259	/* Connect tree node to parent */
		260	if (level == CPUINFO_LVL_ROOT)
		261	node->parent_index = -1;
		262	else
		263	node->parent_index =
		264	level_rover[level - 1];
		265
		266	if (level == CPUINFO_LVL_PROC) {
		267	node->child_end =
		268	(cpu == last_cpu) ? cpu : prev_cpu;
		269	} else {
		270	node->child_end =
		271	level_rover[level + 1] - 1;
		272	}
		273
		274	/* Initialize the next node in the same level */
		275	n = ++level_rover[level];
		276	if (n <= new_tree->level[level].end_index) {
		277	node = &new_tree->nodes[n];
		278	node->id = id;
		279	node->level = level;
		280
		281	/* Connect node to child */
		282	node->child_start = node->child_end =
		283	node->rover =
		284	(level == CPUINFO_LVL_PROC)
		285	? cpu : level_rover[level + 1];
		286	}
		287	} else
		288	num_cpus[level]++;
		289	}
		290	prev_cpu = cpu;
		291	}
		292
		293	return new_tree;
		294	}
		295
		296	static void increment_rover(struct cpuinfo_tree *t, int node_index,
		297	int root_index, const int *rover_inc_table)
		298	{
		299	struct cpuinfo_node *node = &t->nodes[node_index];
		300	int top_level, level;
		301
		302	top_level = t->nodes[root_index].level;
		303	for (level = node->level; level >= top_level; level--) {
		304	node->rover++;
		305	if (node->rover <= node->child_end)
		306	return;
		307
		308	node->rover = node->child_start;
		309	/* If parent's rover does not need to be adjusted, stop here. */
		310	if ((level == top_level) ||
		311	!(rover_inc_table[level] & ROVER_INC_PARENT_ON_LOOP))
		312	return;
		313
		314	node = &t->nodes[node->parent_index];
		315	}
		316	}
		317
		318	static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
		319	{
		320	const int *rover_inc_table;
		321	int level, new_index, index = root_index;
		322
		323	switch (sun4v_chip_type) {
		324	case SUN4V_CHIP_NIAGARA1:
		325	case SUN4V_CHIP_NIAGARA2:
		326	rover_inc_table = niagara_iterate_method;
		327	break;
		328	default:
		329	rover_inc_table = generic_iterate_method;
		330	}
		331
		332	for (level = t->nodes[root_index].level; level < CPUINFO_LVL_MAX;
		333	level++) {
		334	new_index = t->nodes[index].rover;
		335	if (rover_inc_table[level] & ROVER_INC_ON_VISIT)
		336	increment_rover(t, index, root_index, rover_inc_table);
		337
		338	index = new_index;
		339	}
		340	return index;
		341	}
		342
		343	static void _cpu_map_rebuild(void)
		344	{
		345	int i;
		346
		347	if (cpuinfo_tree) {
		348	kfree(cpuinfo_tree);
		349	cpuinfo_tree = NULL;
		350	}
		351
		352	cpuinfo_tree = build_cpuinfo_tree();
		353	if (!cpuinfo_tree)
		354	return;
		355
		356	/* Build CPU distribution map that spans all online CPUs. No need
		357	* to check if the CPU is online, as that is done when the cpuinfo
		358	* tree is being built.
		359	*/
		360	for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++)
		361	cpu_distribution_map[i] = iterate_cpu(cpuinfo_tree, 0);
		362	}
		363
		364	/* Fallback if the cpuinfo tree could not be built. CPU mapping is linear
		365	* round robin.
		366	*/
		367	static int simple_map_to_cpu(unsigned int index)
		368	{
		369	int i, end, cpu_rover;
		370
		371	cpu_rover = 0;
		372	end = index % num_online_cpus();
		373	for (i = 0; i < num_possible_cpus(); i++) {
		374	if (cpu_online(cpu_rover)) {
		375	if (cpu_rover >= end)
		376	return cpu_rover;
		377
		378	cpu_rover++;
		379	}
		380	}
		381
		382	/* Impossible, since num_online_cpus() <= num_possible_cpus() */
		383	return first_cpu(cpu_online_map);
		384	}
		385
		386	static int _map_to_cpu(unsigned int index)
		387	{
		388	struct cpuinfo_node *root_node;
		389
		390	if (unlikely(!cpuinfo_tree)) {
		391	_cpu_map_rebuild();
		392	if (!cpuinfo_tree)
		393	return simple_map_to_cpu(index);
		394	}
		395
		396	root_node = &cpuinfo_tree->nodes[0];
		397	#ifdef CONFIG_HOTPLUG_CPU
		398	if (unlikely(root_node->num_cpus != num_online_cpus())) {
		399	_cpu_map_rebuild();
		400	if (!cpuinfo_tree)
		401	return simple_map_to_cpu(index);
		402	}
		403	#endif
		404	return cpu_distribution_map[index % root_node->num_cpus];
		405	}
		406
		407	int map_to_cpu(unsigned int index)
		408	{
		409	int mapped_cpu;
		410	unsigned long flag;
		411
		412	spin_lock_irqsave(&cpu_map_lock, flag);
		413	mapped_cpu = _map_to_cpu(index);
		414
		415	#ifdef CONFIG_HOTPLUG_CPU
		416	while (unlikely(!cpu_online(mapped_cpu)))
		417	mapped_cpu = _map_to_cpu(index);
		418	#endif
		419	spin_unlock_irqrestore(&cpu_map_lock, flag);
		420	return mapped_cpu;
		421	}
		422	EXPORT_SYMBOL(map_to_cpu);
		423
		424	void cpu_map_rebuild(void)
		425	{
		426	unsigned long flag;
		427
		428	spin_lock_irqsave(&cpu_map_lock, flag);
		429	_cpu_map_rebuild();
		430	spin_unlock_irqrestore(&cpu_map_lock, flag);
		431	}


diff --git a/arch/sparc/kernel/cpumap.h b/arch/sparc/kernel/cpumap.h new file mode 100644 index 000000000000..e639880ab864 --- /dev/null +++ b/arch/sparc/kernel/cpumap.h
@@ -0,0 +1,16 @@
		#ifndef _CPUMAP_H
		#define _CPUMAP_H

		#ifndef CONFIG_SMP

		/* Uniprocessor build: there is no map to set up. */
		#define cpu_map_init()	do {} while (0)

		/* Uniprocessor build: every index maps to the current processor. */
		static inline int map_to_cpu(unsigned int index)
		{
			return raw_smp_processor_id();
		}

		#else /* CONFIG_SMP */

		/* Translate a distribution index into a CPU id. */
		extern int map_to_cpu(unsigned int index);

		/* Rebuild the CPU distribution map. */
		extern void cpu_map_rebuild(void);

		/* Initialising the map is the same operation as rebuilding it. */
		#define cpu_map_init()	cpu_map_rebuild()

		#endif /* !CONFIG_SMP */

		#endif /* _CPUMAP_H */


diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index e5e78f9cfc95..bd075054942b 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c
@@ -45,6 +45,7 @@
45	#include <asm/cacheflush.h>	45	#include <asm/cacheflush.h>
46		46
47	#include "entry.h"	47	#include "entry.h"
		48	#include "cpumap.h"
48		49
49	#define NUM_IVECS (IMAP_INR + 1)	50	#define NUM_IVECS (IMAP_INR + 1)
50		51
@@ -256,35 +257,13 @@ static int irq_choose_cpu(unsigned int virt_irq)
256	int cpuid;	257	int cpuid;
257		258
258	cpumask_copy(&mask, irq_desc[virt_irq].affinity);	259	cpumask_copy(&mask, irq_desc[virt_irq].affinity);
259	if (cpus_equal(mask, CPU_MASK_ALL)) {	260	if (cpus_equal(mask, cpu_online_map)) {
260	static int irq_rover;	261	cpuid = map_to_cpu(virt_irq);
261	static DEFINE_SPINLOCK(irq_rover_lock);
262	unsigned long flags;
263
264	/* Round-robin distribution... */
265	do_round_robin:
266	spin_lock_irqsave(&irq_rover_lock, flags);
267
268	while (!cpu_online(irq_rover)) {
269	if (++irq_rover >= nr_cpu_ids)
270	irq_rover = 0;
271	}
272	cpuid = irq_rover;
273	do {
274	if (++irq_rover >= nr_cpu_ids)
275	irq_rover = 0;
276	} while (!cpu_online(irq_rover));
277
278	spin_unlock_irqrestore(&irq_rover_lock, flags);
279	} else {	262	} else {
280	cpumask_t tmp;	263	cpumask_t tmp;
281		264
282	cpus_and(tmp, cpu_online_map, mask);	265	cpus_and(tmp, cpu_online_map, mask);
283		266	cpuid = cpus_empty(tmp) ? map_to_cpu(virt_irq) : first_cpu(tmp);
284	if (cpus_empty(tmp))
285	goto do_round_robin;
286
287	cpuid = first_cpu(tmp);
288	}	267	}
289		268
290	return cpuid;	269	return cpuid;


diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 1de47d2169c8..cfb3d06058ff 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c
@@ -48,6 +48,8 @@
48	#include <asm/ldc.h>	48	#include <asm/ldc.h>
49	#include <asm/hypervisor.h>	49	#include <asm/hypervisor.h>
50		50
		51	#include "cpumap.h"
		52
51	int sparc64_multi_core __read_mostly;	53	int sparc64_multi_core __read_mostly;
52		54
53	DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;	55	DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
@@ -1314,6 +1316,8 @@ int __cpu_disable(void)
1314	cpu_clear(cpu, cpu_online_map);	1316	cpu_clear(cpu, cpu_online_map);
1315	ipi_call_unlock();	1317	ipi_call_unlock();
1316		1318
		1319	cpu_map_rebuild();
		1320
1317	return 0;	1321	return 0;
1318	}	1322	}
1319		1323