author     Hong H. Pham <hong.pham@windriver.com>    2009-06-04 05:10:11 -0400
committer  David S. Miller <davem@davemloft.net>     2009-06-16 07:56:28 -0400
commit     280ff97494e0fef4124bee5c52e39b23a18dd283 (patch)
tree       e906ca3c5e0a6238882d181ab5b01fb3f40ba5df /arch/sparc/kernel/cpumap.c
parent     4fd78a5f1edf62ab1ca3d23efee4a8a336edb2b6 (diff)
sparc64: fix and optimize irq distribution
irq_choose_cpu() should compare the affinity mask against cpu_online_map
rather than CPU_MASK_ALL, since irq_select_affinity() sets the interrupt's
affinity mask to cpu_online_map ANDed with CPU_MASK_ALL (which ends up
being just cpu_online_map). The mask comparison in irq_choose_cpu()
therefore always fails, since the two masks are not the same. So the CPU
chosen is the first CPU in the intersection of cpu_online_map and
CPU_MASK_ALL, which is always CPU0. That means all interrupts end up
reassigned to CPU0.
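For illustration, here is a condensed paraphrase (not the verbatim kernel
source) of the pre-patch irq_choose_cpu() logic. cpus_equal(), cpus_and(),
and first_cpu() are the old cpumask API; pick_rover_cpu() is a hypothetical
stand-in for the irq_rover bookkeeping:

	cpumask_t mask = irq_desc[virt_irq].affinity; /* == cpu_online_map */
	int cpuid;

	if (cpus_equal(mask, CPU_MASK_ALL)) {
		/* Intended round-robin path.  Never taken unless every
		 * possible CPU is online, because irq_select_affinity()
		 * already reduced the mask to cpu_online_map. */
		cpuid = pick_rover_cpu();	/* hypothetical helper */
	} else {
		cpumask_t tmp;

		/* Path actually taken: the first CPU of the intersection
		 * of cpu_online_map and the affinity mask, i.e. CPU0. */
		cpus_and(tmp, cpu_online_map, mask);
		cpuid = first_cpu(tmp);
	}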
Distributing interrupts to CPUs in a linearly increasing round-robin
fashion is not optimal for the UltraSPARC T1/T2. In addition, the
irq_rover in irq_choose_cpu() causes an interrupt to be assigned to a
different processor each time the interrupt is allocated and released.
This may lead to an unbalanced distribution over time.
To optimize and balance interrupt distribution, a static mapping of
interrupts to processors is used instead. For the T1/T2, interrupts are
spread to different cores first, and then to strands within a core, as
illustrated below.
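As a worked illustration (assumed topology, not output from the patch): on
a single-node T2 with 8 cores, two pipelines per core, and four strands per
pipeline, numbered so that cpu = core*8 + pipeline*4 + strand, the static
map hands out CPUs roughly in the order this stand-alone model prints:

	#include <stdio.h>

	/* Model of the core-first ordering (a sketch, not kernel code).
	 * Cores vary fastest, then pipelines, then strands, so consecutive
	 * interrupt indices land on different cores first.
	 */
	int main(void)
	{
		int n = 0;

		for (int strand = 0; strand < 4; strand++)
			for (int pipeline = 0; pipeline < 2; pipeline++)
				for (int core = 0; core < 8; core++)
					printf("index %2d -> cpu %2d\n",
					       n++,
					       core * 8 + pipeline * 4 + strand);
		return 0;
	}

The first mapped CPUs come out as 0, 8, 16, ..., 56, then 4, 12, ..., 60:
every core is visited before any core's second pipeline, and every pipeline
before any strand is revisited.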
The following benchmarks show the effect of interrupt distribution on a
T2. The test was done with iperf using a pair of T5220 boxes, each with
a 10GbE NIU (XAUI), connected back to back.
TCP     |   Stock        Linear RR IRQ   Optimized IRQ
Streams |   2.6.30-rc5   Distribution    Distribution
        |   GBits/sec    GBits/sec       GBits/sec
--------+-----------------------------------------
      1 |   0.839        0.862           0.868
      8 |   1.16         4.96            5.88
     16 |   1.15         6.40            8.04
    100 |   1.09         7.28            8.68
Signed-off-by: Hong H. Pham <hong.pham@windriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/sparc/kernel/cpumap.c')
-rw-r--r--   arch/sparc/kernel/cpumap.c   431
1 file changed, 431 insertions, 0 deletions
diff --git a/arch/sparc/kernel/cpumap.c b/arch/sparc/kernel/cpumap.c
new file mode 100644
index 000000000000..7430ed080b23
--- /dev/null
+++ b/arch/sparc/kernel/cpumap.c
@@ -0,0 +1,431 @@
/* cpumap.c: used for optimizing CPU assignment
 *
 * Copyright (C) 2009 Hong H. Pham <hong.pham@windriver.com>
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <asm/cpudata.h>
#include "cpumap.h"


enum {
	CPUINFO_LVL_ROOT = 0,
	CPUINFO_LVL_NODE,
	CPUINFO_LVL_CORE,
	CPUINFO_LVL_PROC,
	CPUINFO_LVL_MAX,
};

enum {
	ROVER_NO_OP = 0,
	/* Increment rover every time level is visited */
	ROVER_INC_ON_VISIT = 1 << 0,
	/* Increment parent's rover every time rover wraps around */
	ROVER_INC_PARENT_ON_LOOP = 1 << 1,
};

struct cpuinfo_node {
	int id;
	int level;
	int num_cpus;    /* Number of CPUs in this hierarchy */
	int parent_index;
	int child_start; /* Array index of the first child node */
	int child_end;   /* Array index of the last child node */
	int rover;       /* Child node iterator */
};

struct cpuinfo_level {
	int start_index; /* Index of first node of a level in a cpuinfo tree */
	int end_index;   /* Index of last node of a level in a cpuinfo tree */
	int num_nodes;   /* Number of nodes in a level in a cpuinfo tree */
};

struct cpuinfo_tree {
	int total_nodes;

	/* Offsets into nodes[] for each level of the tree */
	struct cpuinfo_level level[CPUINFO_LVL_MAX];
	struct cpuinfo_node nodes[0];
};


static struct cpuinfo_tree *cpuinfo_tree;

static u16 cpu_distribution_map[NR_CPUS];
static DEFINE_SPINLOCK(cpu_map_lock);


/* Niagara optimized cpuinfo tree traversal. */
static const int niagara_iterate_method[] = {
	[CPUINFO_LVL_ROOT] = ROVER_NO_OP,

	/* Strands (or virtual CPUs) within a core may not run concurrently
	 * on the Niagara, as instruction pipeline(s) are shared.  Distribute
	 * work to strands in different cores first for better concurrency.
	 * Go to next NUMA node when all cores are used.
	 */
	[CPUINFO_LVL_NODE] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,

	/* Strands are grouped together by proc_id in cpuinfo_sparc, i.e.
	 * a proc_id represents an instruction pipeline.  Distribute work to
	 * strands in different proc_id groups if the core has multiple
	 * instruction pipelines (e.g. the Niagara 2/2+ has two).
	 */
	[CPUINFO_LVL_CORE] = ROVER_INC_ON_VISIT,

	/* Pick the next strand in the proc_id group. */
	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT,
};

/* Generic cpuinfo tree traversal.  Distribute work round robin across NUMA
 * nodes.
 */
static const int generic_iterate_method[] = {
	[CPUINFO_LVL_ROOT] = ROVER_INC_ON_VISIT,
	[CPUINFO_LVL_NODE] = ROVER_NO_OP,
	[CPUINFO_LVL_CORE] = ROVER_INC_PARENT_ON_LOOP,
	[CPUINFO_LVL_PROC] = ROVER_INC_ON_VISIT|ROVER_INC_PARENT_ON_LOOP,
};

static int cpuinfo_id(int cpu, int level)
{
	int id;

	switch (level) {
	case CPUINFO_LVL_ROOT:
		id = 0;
		break;
	case CPUINFO_LVL_NODE:
		id = cpu_to_node(cpu);
		break;
	case CPUINFO_LVL_CORE:
		id = cpu_data(cpu).core_id;
		break;
	case CPUINFO_LVL_PROC:
		id = cpu_data(cpu).proc_id;
		break;
	default:
		id = -EINVAL;
	}
	return id;
}

/*
 * Enumerate the CPU information in __cpu_data to determine the start index,
 * end index, and number of nodes for each level in the cpuinfo tree.  The
 * total number of cpuinfo nodes required to build the tree is returned.
 */
static int enumerate_cpuinfo_nodes(struct cpuinfo_level *tree_level)
{
	int prev_id[CPUINFO_LVL_MAX];
	int i, n, num_nodes;

	for (i = CPUINFO_LVL_ROOT; i < CPUINFO_LVL_MAX; i++) {
		struct cpuinfo_level *lv = &tree_level[i];

		prev_id[i] = -1;
		lv->start_index = lv->end_index = lv->num_nodes = 0;
	}

	num_nodes = 1; /* Include the root node */

	for (i = 0; i < num_possible_cpus(); i++) {
		if (!cpu_online(i))
			continue;

		n = cpuinfo_id(i, CPUINFO_LVL_NODE);
		if (n > prev_id[CPUINFO_LVL_NODE]) {
			tree_level[CPUINFO_LVL_NODE].num_nodes++;
			prev_id[CPUINFO_LVL_NODE] = n;
			num_nodes++;
		}
		n = cpuinfo_id(i, CPUINFO_LVL_CORE);
		if (n > prev_id[CPUINFO_LVL_CORE]) {
			tree_level[CPUINFO_LVL_CORE].num_nodes++;
			prev_id[CPUINFO_LVL_CORE] = n;
			num_nodes++;
		}
		n = cpuinfo_id(i, CPUINFO_LVL_PROC);
		if (n > prev_id[CPUINFO_LVL_PROC]) {
			tree_level[CPUINFO_LVL_PROC].num_nodes++;
			prev_id[CPUINFO_LVL_PROC] = n;
			num_nodes++;
		}
	}

	tree_level[CPUINFO_LVL_ROOT].num_nodes = 1;

	n = tree_level[CPUINFO_LVL_NODE].num_nodes;
	tree_level[CPUINFO_LVL_NODE].start_index = 1;
	tree_level[CPUINFO_LVL_NODE].end_index = n;

	n++;
	tree_level[CPUINFO_LVL_CORE].start_index = n;
	n += tree_level[CPUINFO_LVL_CORE].num_nodes;
	tree_level[CPUINFO_LVL_CORE].end_index = n - 1;

	tree_level[CPUINFO_LVL_PROC].start_index = n;
	n += tree_level[CPUINFO_LVL_PROC].num_nodes;
	tree_level[CPUINFO_LVL_PROC].end_index = n - 1;

	return num_nodes;
}

/* Build a tree representation of the CPU hierarchy using the per CPU
 * information in __cpu_data.  Entries in __cpu_data[0..NR_CPUS] are
 * assumed to be sorted in ascending order based on node, core_id, and
 * proc_id (in order of significance).
 */
static struct cpuinfo_tree *build_cpuinfo_tree(void)
{
	struct cpuinfo_tree *new_tree;
	struct cpuinfo_node *node;
	struct cpuinfo_level tmp_level[CPUINFO_LVL_MAX];
	int num_cpus[CPUINFO_LVL_MAX];
	int level_rover[CPUINFO_LVL_MAX];
	int prev_id[CPUINFO_LVL_MAX];
	int n, id, cpu, prev_cpu, last_cpu, level;

	n = enumerate_cpuinfo_nodes(tmp_level);

	new_tree = kzalloc(sizeof(struct cpuinfo_tree) +
			   (sizeof(struct cpuinfo_node) * n), GFP_ATOMIC);
	if (!new_tree)
		return NULL;

	new_tree->total_nodes = n;
	memcpy(&new_tree->level, tmp_level, sizeof(tmp_level));

	prev_cpu = cpu = first_cpu(cpu_online_map);

	/* Initialize all levels in the tree with the first CPU */
	for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT; level--) {
		n = new_tree->level[level].start_index;

		level_rover[level] = n;
		node = &new_tree->nodes[n];

		id = cpuinfo_id(cpu, level);
		if (unlikely(id < 0)) {
			kfree(new_tree);
			return NULL;
		}
		node->id = id;
		node->level = level;
		node->num_cpus = 1;

		node->parent_index = (level > CPUINFO_LVL_ROOT)
		    ? new_tree->level[level - 1].start_index : -1;

		node->child_start = node->child_end = node->rover =
		    (level == CPUINFO_LVL_PROC)
		    ? cpu : new_tree->level[level + 1].start_index;

		prev_id[level] = node->id;
		num_cpus[level] = 1;
	}

	for (last_cpu = (num_possible_cpus() - 1); last_cpu >= 0; last_cpu--) {
		if (cpu_online(last_cpu))
			break;
	}

	while (++cpu <= last_cpu) {
		if (!cpu_online(cpu))
			continue;

		for (level = CPUINFO_LVL_PROC; level >= CPUINFO_LVL_ROOT;
		     level--) {
			id = cpuinfo_id(cpu, level);
			if (unlikely(id < 0)) {
				kfree(new_tree);
				return NULL;
			}

			if ((id != prev_id[level]) || (cpu == last_cpu)) {
				prev_id[level] = id;
				node = &new_tree->nodes[level_rover[level]];
				node->num_cpus = num_cpus[level];
				num_cpus[level] = 1;

				if (cpu == last_cpu)
					node->num_cpus++;

				/* Connect tree node to parent */
				if (level == CPUINFO_LVL_ROOT)
					node->parent_index = -1;
				else
					node->parent_index =
					    level_rover[level - 1];

				if (level == CPUINFO_LVL_PROC) {
					node->child_end =
					    (cpu == last_cpu) ? cpu : prev_cpu;
				} else {
					node->child_end =
					    level_rover[level + 1] - 1;
				}

				/* Initialize the next node in the same level */
				n = ++level_rover[level];
				if (n <= new_tree->level[level].end_index) {
					node = &new_tree->nodes[n];
					node->id = id;
					node->level = level;

					/* Connect node to child */
					node->child_start = node->child_end =
					    node->rover =
					    (level == CPUINFO_LVL_PROC)
					    ? cpu : level_rover[level + 1];
				}
			} else
				num_cpus[level]++;
		}
		prev_cpu = cpu;
	}

	return new_tree;
}

static void increment_rover(struct cpuinfo_tree *t, int node_index,
			    int root_index, const int *rover_inc_table)
{
	struct cpuinfo_node *node = &t->nodes[node_index];
	int top_level, level;

	top_level = t->nodes[root_index].level;
	for (level = node->level; level >= top_level; level--) {
		node->rover++;
		if (node->rover <= node->child_end)
			return;

		node->rover = node->child_start;
		/* If parent's rover does not need to be adjusted, stop here. */
		if ((level == top_level) ||
		    !(rover_inc_table[level] & ROVER_INC_PARENT_ON_LOOP))
			return;

		node = &t->nodes[node->parent_index];
	}
}

static int iterate_cpu(struct cpuinfo_tree *t, unsigned int root_index)
{
	const int *rover_inc_table;
	int level, new_index, index = root_index;

	switch (sun4v_chip_type) {
	case SUN4V_CHIP_NIAGARA1:
	case SUN4V_CHIP_NIAGARA2:
		rover_inc_table = niagara_iterate_method;
		break;
	default:
		rover_inc_table = generic_iterate_method;
	}

	for (level = t->nodes[root_index].level; level < CPUINFO_LVL_MAX;
	     level++) {
		new_index = t->nodes[index].rover;
		if (rover_inc_table[level] & ROVER_INC_ON_VISIT)
			increment_rover(t, index, root_index, rover_inc_table);

		index = new_index;
	}
	return index;
}

static void _cpu_map_rebuild(void)
{
	int i;

	if (cpuinfo_tree) {
		kfree(cpuinfo_tree);
		cpuinfo_tree = NULL;
	}

	cpuinfo_tree = build_cpuinfo_tree();
	if (!cpuinfo_tree)
		return;

	/* Build CPU distribution map that spans all online CPUs.  No need
	 * to check if the CPU is online, as that is done when the cpuinfo
	 * tree is being built.
	 */
	for (i = 0; i < cpuinfo_tree->nodes[0].num_cpus; i++)
		cpu_distribution_map[i] = iterate_cpu(cpuinfo_tree, 0);
}

/* Fallback if the cpuinfo tree could not be built.  CPU mapping is linear
 * round robin.
 */
static int simple_map_to_cpu(unsigned int index)
{
	int i, end, cpu_rover;

	cpu_rover = 0;
	end = index % num_online_cpus();
	for (i = 0; i < num_possible_cpus(); i++) {
		if (cpu_online(cpu_rover)) {
			if (cpu_rover >= end)
				return cpu_rover;

			cpu_rover++;
		}
	}

	/* Impossible, since num_online_cpus() <= num_possible_cpus() */
	return first_cpu(cpu_online_map);
}

static int _map_to_cpu(unsigned int index)
{
	struct cpuinfo_node *root_node;

	if (unlikely(!cpuinfo_tree)) {
		_cpu_map_rebuild();
		if (!cpuinfo_tree)
			return simple_map_to_cpu(index);
	}

	root_node = &cpuinfo_tree->nodes[0];
#ifdef CONFIG_HOTPLUG_CPU
	if (unlikely(root_node->num_cpus != num_online_cpus())) {
		_cpu_map_rebuild();
		if (!cpuinfo_tree)
			return simple_map_to_cpu(index);
	}
#endif
	return cpu_distribution_map[index % root_node->num_cpus];
}

int map_to_cpu(unsigned int index)
{
	int mapped_cpu;
	unsigned long flag;

	spin_lock_irqsave(&cpu_map_lock, flag);
	mapped_cpu = _map_to_cpu(index);

#ifdef CONFIG_HOTPLUG_CPU
	while (unlikely(!cpu_online(mapped_cpu)))
		mapped_cpu = _map_to_cpu(index);
#endif
	spin_unlock_irqrestore(&cpu_map_lock, flag);
	return mapped_cpu;
}
EXPORT_SYMBOL(map_to_cpu);

void cpu_map_rebuild(void)
{
	unsigned long flag;

	spin_lock_irqsave(&cpu_map_lock, flag);
	_cpu_map_rebuild();
	spin_unlock_irqrestore(&cpu_map_lock, flag);
}
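For context, the companion sparc64 irq patch makes irq_choose_cpu() the
main consumer of this API. The sketch below approximates that caller
(structure paraphrased, not copied verbatim):

	static int irq_choose_cpu(unsigned int virt_irq)
	{
		cpumask_t mask = irq_desc[virt_irq].affinity;
		int cpuid;

		if (cpus_equal(mask, cpu_online_map)) {
			/* No specific affinity requested: use the static,
			 * topology-aware distribution map. */
			cpuid = map_to_cpu(virt_irq);
		} else {
			cpumask_t tmp;

			cpus_and(tmp, cpu_online_map, mask);
			cpuid = cpus_empty(tmp) ?
			    map_to_cpu(virt_irq) : first_cpu(tmp);
		}
		return cpuid;
	}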