author     Hong H. Pham <hong.pham@windriver.com>    2009-06-04 05:10:11 -0400
committer  David S. Miller <davem@davemloft.net>     2009-06-16 07:56:28 -0400
commit     280ff97494e0fef4124bee5c52e39b23a18dd283 (patch)
tree       e906ca3c5e0a6238882d181ab5b01fb3f40ba5df /arch/sparc/kernel/cpumap.c
parent     4fd78a5f1edf62ab1ca3d23efee4a8a336edb2b6 (diff)
sparc64: fix and optimize irq distribution
irq_choose_cpu() should compare the affinity mask against cpu_online_map
rather than CPU_MASK_ALL, since irq_select_affinity() sets the interrupt's
affinity mask to cpu_online_map ANDed with CPU_MASK_ALL (which ends up
being just cpu_online_map). The mask comparison in irq_choose_cpu()
therefore always fails, since the two masks are not the same. So the CPU
chosen is the first CPU in the intersection of cpu_online_map and
CPU_MASK_ALL, which is always CPU0. That means all interrupts end up
reassigned to CPU0.
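For illustration, here is a condensed paraphrase (not the verbatim kernel
source) of the pre-patch irq_choose_cpu() logic. cpus_equal(), cpus_and(),
and first_cpu() are the old cpumask API; pick_rover_cpu() is a hypothetical
stand-in for the irq_rover bookkeeping:

	cpumask_t mask = irq_desc[virt_irq].affinity; /* == cpu_online_map */
	int cpuid;

	if (cpus_equal(mask, CPU_MASK_ALL)) {
		/* Intended round-robin path.  Never taken unless every
		 * possible CPU is online, because irq_select_affinity()
		 * already reduced the mask to cpu_online_map. */
		cpuid = pick_rover_cpu();	/* hypothetical helper */
	} else {
		cpumask_t tmp;

		/* Path actually taken: the first CPU of the intersection
		 * of cpu_online_map and the affinity mask, i.e. CPU0. */
		cpus_and(tmp, cpu_online_map, mask);
		cpuid = first_cpu(tmp);
	}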
Distributing interrupts to CPUs in a linearly increasing round-robin
fashion is not optimal for the UltraSPARC T1/T2. In addition, the
irq_rover in irq_choose_cpu() causes an interrupt to be assigned to a
different processor each time the interrupt is allocated and released.
This may lead to an unbalanced distribution over time.
To optimize and balance interrupt distribution, a static mapping of
interrupts to processors is used instead. For the T1/T2, interrupts are
spread to different cores first, and then to strands within a core, as
illustrated below.
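As a worked illustration (assumed topology, not output from the patch): on
a single-node T2 with 8 cores, two pipelines per core, and four strands per
pipeline, numbered so that cpu = core*8 + pipeline*4 + strand, the static
map hands out CPUs roughly in the order this stand-alone model prints:

	#include <stdio.h>

	/* Model of the core-first ordering (a sketch, not kernel code).
	 * Cores vary fastest, then pipelines, then strands, so consecutive
	 * interrupt indices land on different cores first.
	 */
	int main(void)
	{
		int n = 0;

		for (int strand = 0; strand < 4; strand++)
			for (int pipeline = 0; pipeline < 2; pipeline++)
				for (int core = 0; core < 8; core++)
					printf("index %2d -> cpu %2d\n",
					       n++,
					       core * 8 + pipeline * 4 + strand);
		return 0;
	}

The first mapped CPUs come out as 0, 8, 16, ..., 56, then 4, 12, ..., 60:
every core is visited before any core's second pipeline, and every pipeline
before any strand is revisited.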
The following benchmarks show the effect of interrupt distribution on a
T2. The test was done with iperf using a pair of T5220 boxes, each with
a 10GbE NIU (XAUI), connected back to back.
TCP     |   Stock        Linear RR IRQ   Optimized IRQ
Streams |   2.6.30-rc5   Distribution    Distribution
        |   GBits/sec    GBits/sec       GBits/sec
--------+-----------------------------------------
      1 |   0.839        0.862           0.868
      8 |   1.16         4.96            5.88
     16 |   1.15         6.40            8.04
    100 |   1.09         7.28            8.68
Signed-off-by: Hong H. Pham <hong.pham@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc/kernel/cpumap.c')
-rw-r--r--   arch/sparc/kernel/cpumap.c   431
1 file changed, 431 insertions, 0 deletions
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
new file mode 100644
index 000000000000..7430ed080b23
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.c
@@ -0,0 +1,431 @@
/* cpumap.c: used for optimizing CPU assignment
 *
 * Copyright (C) 2009 Hong H. Pham <hong.pham@windriver.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <asm/cpudata.h>
#include "cpumap.h"


enum {
	CPUINFO_LVL_ROOT = 0,
	CPUINFO_LVL_NODE,
	CPUINFO_LVL_CORE,
	CPUINFO_LVL_PROC,
	CPUINFO_LVL_MAX,
};

enum {
	ROVER_NO_OP = 0,
	/* Increment rover every time level is visited */
	ROVER_INC_ON_VISIT = 1 << 0,
	/* Increment parent's rover every time rover wraps around */
	ROVER_INC_PARENT_ON_LOOP = 1 << 1,
};

struct cpuinfo_node {
	int id;
	int level;
	int num_cpus;    /* Number of CPUs in this hierarchy */
	int parent_index;
	int child_start; /* Array index of the first child node */
	int child_end;   /* Array index of the last child node */
	int rover;       /* Child node iterator */
};

struct cpuinfo_level {
	int start_index; /* Index of first node of a level in a cpuinfo tree */
	int end_index;   /* Index of last node of a level in a cpuinfo tree */
	int num_nodes;   /* Number of nodes in a level in a cpuinfo tree */
};

struct cpuinfo_tree {
	int total_nodes;

	/* Offsets into nodes[] for each level of the tree */
	struct cpuinfo_level level[CPUINFO_LVL_MAX];
	struct cpuinfo_node nodes[0];
};


static struct cpuinfo_tree *cpuinfo_tree;

static u16 cpu_distribution_map[NR_CPUS];
static DEFINE_SPINLOCK(cpu_map_lock);


/* Niagara optimized cpuinfo tree traversal. */
static const int niagara_iterate_method[] = {
	[CPUINFO_LVL_ROOT] = ROVER_NO_OP,

	/* Strands (or virtual CPUs) within a core may not run concurrently
	 * on the Niagara, as instruction pipeline(s) are shared.  Distribute
	 * work to strands in different cores first for better concurrency.
	 * Go to next NUMA node when all cores are used.
	 */
	[CPUINFO_LVL_NODE] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,

	/* Strands are grouped together by proc_id in cpuinfo_sparc, i.e.
	 * a proc_id represents an instruction pipeline.  Distribute work to
	 * strands in different proc_id groups if the core has multiple
	 * instruction pipelines (e.g. the Niagara 2/2+ has two).
	 */
	[CPUINFO_LVL_CORE] = ROVER_INC_ON_VISIT,

	/* Pick the next strand in the proc_id group. */
	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT,
};

/* Generic cpuinfo tree traversal.  Distribute work round robin across NUMA
 * nodes.
 */
static const int generic_iterate_method[] = {
	[CPUINFO_LVL_ROOT] = ROVER_INC_ON_VISIT,
	[CPUINFO_LVL_NODE] = ROVER_NO_OP,
	[CPUINFO_LVL_CORE] = ROVER_INC_PARENT_ON_LOOP,
	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
};

static int cpuinfo_id(int cpu, int level)
{
	int id;

	switch (level) {
	case CPUINFO_LVL_ROOT:
		id = 0;
		break;
	case CPUINFO_LVL_NODE:
		id = cpu_to_node(cpu);
		break;
	case CPUINFO_LVL_CORE:
		id = cpu_data(cpu).core_id;
		break;
	case CPUINFO_LVL_PROC:
		id = cpu_data(cpu).proc_id;
		break;
	default:
		id = -EINVAL;
	}
	return id;
}

/*
 * Enumerate the CPU information in __cpu_data to determine the start index,
 * end index, and number of nodes for each level in the cpuinfo tree.  The
 * total number of cpuinfo nodes required to build the tree is returned.
 */
static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
{
	int prev_id[CPUINFO_LVL_MAX];
	int i, n, num_nodes;

	for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) {
		struct cpuinfo_level *lv = &tree_level[i];

		prev_id[i] = -1;
		lv->start_index = lv->end_index = lv->num_nodes = 0;
	}

	num_nodes = 1; /* Include the root node */

	for (i = 0; i < num_possible_cpus(); i++) {
		if (!cpu_online(i))
			continue;

		n = cpuinfo_id(i, CPUINFO_LVL_NODE);
		if (n > prev_id[CPUINFO_LVL_NODE]) {
			tree_level[CPUINFO_LVL_NODE].num_nodes++;
			prev_id[CPUINFO_LVL_NODE] = n;
			num_nodes++;
		}
		n = cpuinfo_id(i, CPUINFO_LVL_CORE);
		if (n > prev_id[CPUINFO_LVL_CORE]) {
			tree_level[CPUINFO_LVL_CORE].num_nodes++;
			prev_id[CPUINFO_LVL_CORE] = n;
			num_nodes++;
		}
		n = cpuinfo_id(i, CPUINFO_LVL_PROC);
		if (n > prev_id[CPUINFO_LVL_PROC]) {
			tree_level[CPUINFO_LVL_PROC].num_nodes++;
			prev_id[CPUINFO_LVL_PROC] = n;
			num_nodes++;
		}
	}

	tree_level[CPUINFO_LVL_ROOT].num_nodes = 1;

	n = tree_level[CPUINFO_LVL_NODE].num_nodes;
	tree_level[CPUINFO_LVL_NODE].start_index = 1;
	tree_level[CPUINFO_LVL_NODE].end_index = n;

	n++;
	tree_level[CPUINFO_LVL_CORE].start_index = n;
	n += tree_level[CPUINFO_LVL_CORE].num_nodes;
	tree_level[CPUINFO_LVL_CORE].end_index = n - 1;

	tree_level[CPUINFO_LVL_PROC].start_index = n;
	n += tree_level[CPUINFO_LVL_PROC].num_nodes;
	tree_level[CPUINFO_LVL_PROC].end_index = n - 1;

	return num_nodes;
}

/* Build a tree representation of the CPU hierarchy using the per CPU
 * information in __cpu_data.  Entries in __cpu_data[0..NR_CPUS] are
 * assumed to be sorted in ascending order based on node, core_id, and
 * proc_id (in order of significance).
 */
static struct cpuinfo_tree *build_cpuinfo_tree(void)
{
	struct cpuinfo_tree *new_tree;
	struct cpuinfo_node *node;
	struct cpuinfo_level tmp_level[CPUINFO_LVL_MAX];
	int num_cpus[CPUINFO_LVL_MAX];
	int level_rover[CPUINFO_LVL_MAX];
	int prev_id[CPUINFO_LVL_MAX];
	int n, id, cpu, prev_cpu, last_cpu, level;

	n = enumerate_cpuinfo_nodes(tmp_level);

	new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
			   (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
	if (!new_tree)
		return NULL;

	new_tree->total_nodes = n;
	memcpy(&new_tree->level, tmp_level, sizeof(tmp_level));

	prev_cpu = cpu = first_cpu(cpu_online_map);

	/* Initialize all levels in the tree with the first CPU */
	for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT; level--) {
		n = new_tree->level[level].start_index;

		level_rover[level] = n;
		node = &new_tree->nodes[n];

		id = cpuinfo_id(cpu, level);
		if (unlikely(id < 0)) {
			kfree(new_tree);
			return NULL;
		}
		node->id = id;
		node->level = level;
		node->num_cpus = 1;

		node->parent_index = (level > CPUINFO_LVL_ROOT)
		    ? new_tree->level[level - 1].start_index : -1;

		node->child_start = node->child_end = node->rover =
		    (level == CPUINFO_LVL_PROC)
		    ? cpu : new_tree->level[level + 1].start_index;

		prev_id[level] = node->id;
		num_cpus[level] = 1;
	}

	for (last_cpu = (num_possible_cpus() - 1); last_cpu >= 0; last_cpu--) {
		if (cpu_online(last_cpu))
			break;
	}

	while (++cpu <= last_cpu) {
		if (!cpu_online(cpu))
			continue;

		for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT;
		     level--) {
			id = cpuinfo_id(cpu, level);
			if (unlikely(id < 0)) {
				kfree(new_tree);
				return NULL;
			}

			if ((id != prev_id[level]) || (cpu == last_cpu)) {
				prev_id[level] = id;
				node = &new_tree->nodes[level_rover[level]];
				node->num_cpus = num_cpus[level];
				num_cpus[level] = 1;

				if (cpu == last_cpu)
					node->num_cpus++;

				/* Connect tree node to parent */
				if (level == CPUINFO_LVL_ROOT)
					node->parent_index = -1;
				else
					node->parent_index =
					    level_rover[level - 1];

				if (level == CPUINFO_LVL_PROC) {
					node->child_end =
					    (cpu == last_cpu) ? cpu : prev_cpu;
				} else {
					node->child_end =
					    level_rover[level + 1] - 1;
				}

				/* Initialize the next node in the same level */
				n = ++level_rover[level];
				if (n <= new_tree->level[level].end_index) {
					node = &new_tree->nodes[n];
					node->id = id;
					node->level = level;

					/* Connect node to child */
					node->child_start = node->child_end =
					    node->rover =
					    (level == CPUINFO_LVL_PROC)
					    ? cpu : level_rover[level + 1];
				}
			} else
				num_cpus[level]++;
		}
		prev_cpu = cpu;
	}

	return new_tree;
}

static void increment_rover(struct cpuinfo_tree *t, int node_index,
			    int root_index, const int *rover_inc_table)
{
	struct cpuinfo_node *node = &t->nodes[node_index];
	int top_level, level;

	top_level = t->nodes[root_index].level;
	for (level = node->level; level >= top_level; level--) {
		node->rover++;
		if (node->rover <= node->child_end)
			return;

		node->rover = node->child_start;
		/* If parent's rover does not need to be adjusted, stop here. */
		if ((level == top_level) ||
		    !(rover_inc_table[level] & ROVER_INC_PARENT_ON_LOOP))
			return;

		node = &t->nodes[node->parent_index];
	}
}

static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
{
	const int *rover_inc_table;
	int level, new_index, index = root_index;

	switch (sun4v_chip_type) {
	case SUN4V_CHIP_NIAGARA1:
	case SUN4V_CHIP_NIAGARA2:
		rover_inc_table = niagara_iterate_method;
		break;
	default:
		rover_inc_table = generic_iterate_method;
	}

	for (level = t->nodes[root_index].level; level < CPUINFO_LVL_MAX;
	     level++) {
		new_index = t->nodes[index].rover;
		if (rover_inc_table[level] & ROVER_INC_ON_VISIT)
			increment_rover(t, index, root_index, rover_inc_table);

		index = new_index;
	}
	return index;
}

static void _cpu_map_rebuild(void)
{
	int i;

	if (cpuinfo_tree) {
		kfree(cpuinfo_tree);
		cpuinfo_tree = NULL;
	}

	cpuinfo_tree = build_cpuinfo_tree();
	if (!cpuinfo_tree)
		return;

	/* Build CPU distribution map that spans all online CPUs.  No need
	 * to check if the CPU is online, as that is done when the cpuinfo
	 * tree is being built.
	 */
	for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++)
		cpu_distribution_map[i] = iterate_cpu(cpuinfo_tree, 0);
}

/* Fallback if the cpuinfo tree could not be built.  CPU mapping is linear
 * round robin.
 */
static int simple_map_to_cpu(unsigned int index)
{
	int i, end, cpu_rover;

	cpu_rover = 0;
	end = index % num_online_cpus();
	for (i = 0; i < num_possible_cpus(); i++) {
		if (cpu_online(cpu_rover)) {
			if (cpu_rover >= end)
				return cpu_rover;

			cpu_rover++;
		}
	}

	/* Impossible, since num_online_cpus() <= num_possible_cpus() */
	return first_cpu(cpu_online_map);
}

static int _map_to_cpu(unsigned int index)
{
	struct cpuinfo_node *root_node;

	if (unlikely(!cpuinfo_tree)) {
		_cpu_map_rebuild();
		if (!cpuinfo_tree)
			return simple_map_to_cpu(index);
	}

	root_node = &cpuinfo_tree->nodes[0];
#ifdef CONFIG_HOTPLUG_CPU
	if (unlikely(root_node->num_cpus != num_online_cpus())) {
		_cpu_map_rebuild();
		if (!cpuinfo_tree)
			return simple_map_to_cpu(index);
	}
#endif
	return cpu_distribution_map[index % root_node->num_cpus];
}

int map_to_cpu(unsigned int index)
{
	int mapped_cpu;
	unsigned long flag;

	spin_lock_irqsave(&cpu_map_lock, flag);
	mapped_cpu = _map_to_cpu(index);

#ifdef CONFIG_HOTPLUG_CPU
	while (unlikely(!cpu_online(mapped_cpu)))
		mapped_cpu = _map_to_cpu(index);
#endif
	spin_unlock_irqrestore(&cpu_map_lock, flag);
	return mapped_cpu;
}
EXPORT_SYMBOL(map_to_cpu);

void cpu_map_rebuild(void)
{
	unsigned long flag;

	spin_lock_irqsave(&cpu_map_lock, flag);
	_cpu_map_rebuild();
	spin_unlock_irqrestore(&cpu_map_lock, flag);
}
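For context, the companion sparc64 irq patch makes irq_choose_cpu() the
main consumer of this API. The sketch below approximates that caller
(structure paraphrased, not copied verbatim):

	static int irq_choose_cpu(unsigned int virt_irq)
	{
		cpumask_t mask = irq_desc[virt_irq].affinity;
		int cpuid;

		if (cpus_equal(mask, cpu_online_map)) {
			/* No specific affinity requested: use the static,
			 * topology-aware distribution map. */
			cpuid = map_to_cpu(virt_irq);
		} else {
			cpumask_t tmp;

			cpus_and(tmp, cpu_online_map, mask);
			cpuid = cpus_empty(tmp) ?
			    map_to_cpu(virt_irq) : first_cpu(tmp);
		}
		return cpuid;
	}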