about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- arch/ia64/kernel/Makefile 2
-rw-r--r-- arch/ia64/kernel/domain.c 444
-rw-r--r-- include/asm-ia64/processor.h 3
-rw-r--r-- include/asm-ia64/topology.h 23
-rw-r--r-- include/linux/sched.h 7
-rw-r--r-- include/linux/topology.h 23
-rw-r--r-- kernel/sched.c 290
7 files changed, 260 insertions, 532 deletions
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index b242594be55b..307514f7a282 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acpi-ext.o
16obj-$(CONFIG_IA64_PALINFO) += palinfo.o 16obj-$(CONFIG_IA64_PALINFO) += palinfo.o
17obj-$(CONFIG_IOSAPIC) += iosapic.o 17obj-$(CONFIG_IOSAPIC) += iosapic.o
18obj-$(CONFIG_MODULES) += module.o 18obj-$(CONFIG_MODULES) += module.o
19obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o 19obj-$(CONFIG_SMP) += smp.o smpboot.o
20obj-$(CONFIG_NUMA) += numa.o 20obj-$(CONFIG_NUMA) += numa.o
21obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o 21obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
22obj-$(CONFIG_IA64_CYCLONE) += cyclone.o 22obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
deleted file mode 100644
index e907109983f1..000000000000
--- a/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,444 +0,0 @@
1/*
2 * arch/ia64/kernel/domain.c
3 * Architecture specific sched-domains builder.
4 *
5 * Copyright (C) 2004 Jesse Barnes
6 * Copyright (C) 2004 Silicon Graphics, Inc.
7 */
8
9#include <linux/sched.h>
10#include <linux/percpu.h>
11#include <linux/slab.h>
12#include <linux/cpumask.h>
13#include <linux/init.h>
14#include <linux/topology.h>
15#include <linux/nodemask.h>
16
17#define SD_NODES_PER_DOMAIN 16
18
19#ifdef CONFIG_NUMA
20/**
21 * find_next_best_node - find the next node to include in a sched_domain
22 * @node: node whose sched_domain we're building
23 * @used_nodes: nodes already in the sched_domain
24 *
25 * Find the next node to include in a given scheduling domain. Simply
26 * finds the closest node not already in the @used_nodes map.
27 *
28 * Should use nodemask_t.
29 */
30static int find_next_best_node(int node, unsigned long *used_nodes)
31{
32 int i, n, val, min_val, best_node = 0;
33
34 min_val = INT_MAX;
35
36 for (i = 0; i < MAX_NUMNODES; i++) {
37 /* Start at @node */
38 n = (node + i) % MAX_NUMNODES;
39
40 if (!nr_cpus_node(n))
41 continue;
42
43 /* Skip already used nodes */
44 if (test_bit(n, used_nodes))
45 continue;
46
47 /* Simple min distance search */
48 val = node_distance(node, n);
49
50 if (val < min_val) {
51 min_val = val;
52 best_node = n;
53 }
54 }
55
56 set_bit(best_node, used_nodes);
57 return best_node;
58}
59
60/**
61 * sched_domain_node_span - get a cpumask for a node's sched_domain
62 * @node: node whose cpumask we're constructing
63 * @size: number of nodes to include in this span
64 *
65 * Given a node, construct a good cpumask for its sched_domain to span. It
66 * should be one that prevents unnecessary balancing, but also spreads tasks
67 * out optimally.
68 */
69static cpumask_t sched_domain_node_span(int node)
70{
71 int i;
72 cpumask_t span, nodemask;
73 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
74
75 cpus_clear(span);
76 bitmap_zero(used_nodes, MAX_NUMNODES);
77
78 nodemask = node_to_cpumask(node);
79 cpus_or(span, span, nodemask);
80 set_bit(node, used_nodes);
81
82 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
83 int next_node = find_next_best_node(node, used_nodes);
84 nodemask = node_to_cpumask(next_node);
85 cpus_or(span, span, nodemask);
86 }
87
88 return span;
89}
90#endif
91
92/*
93 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
94 * can switch it on easily if needed.
95 */
96#ifdef CONFIG_SCHED_SMT
97static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
98static struct sched_group sched_group_cpus[NR_CPUS];
99static int cpu_to_cpu_group(int cpu)
100{
101 return cpu;
102}
103#endif
104
105static DEFINE_PER_CPU(struct sched_domain, phys_domains);
106static struct sched_group sched_group_phys[NR_CPUS];
107static int cpu_to_phys_group(int cpu)
108{
109#ifdef CONFIG_SCHED_SMT
110 return first_cpu(cpu_sibling_map[cpu]);
111#else
112 return cpu;
113#endif
114}
115
116#ifdef CONFIG_NUMA
117/*
118 * The init_sched_build_groups can't handle what we want to do with node
119 * groups, so roll our own. Now each node has its own list of groups which
120 * gets dynamically allocated.
121 */
122static DEFINE_PER_CPU(struct sched_domain, node_domains);
123static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
124
125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
126static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
127
128static int cpu_to_allnodes_group(int cpu)
129{
130 return cpu_to_node(cpu);
131}
132#endif
133
134/*
135 * Build sched domains for a given set of cpus and attach the sched domains
136 * to the individual cpus
137 */
138void build_sched_domains(const cpumask_t *cpu_map)
139{
140 int i;
141#ifdef CONFIG_NUMA
142 struct sched_group **sched_group_nodes = NULL;
143 struct sched_group *sched_group_allnodes = NULL;
144
145 /*
146 * Allocate the per-node list of sched groups
147 */
148 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
149 GFP_ATOMIC);
150 if (!sched_group_nodes) {
151 printk(KERN_WARNING "Can not alloc sched group node list\n");
152 return;
153 }
154 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
155#endif
156
157 /*
158 * Set up domains for cpus specified by the cpu_map.
159 */
160 for_each_cpu_mask(i, *cpu_map) {
161 int group;
162 struct sched_domain *sd = NULL, *p;
163 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
164
165 cpus_and(nodemask, nodemask, *cpu_map);
166
167#ifdef CONFIG_NUMA
168 if (cpus_weight(*cpu_map)
169 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
170 if (!sched_group_allnodes) {
171 sched_group_allnodes
172 = kmalloc(sizeof(struct sched_group)
173 * MAX_NUMNODES,
174 GFP_KERNEL);
175 if (!sched_group_allnodes) {
176 printk(KERN_WARNING
177 "Can not alloc allnodes sched group\n");
178 break;
179 }
180 sched_group_allnodes_bycpu[i]
181 = sched_group_allnodes;
182 }
183 sd = &per_cpu(allnodes_domains, i);
184 *sd = SD_ALLNODES_INIT;
185 sd->span = *cpu_map;
186 group = cpu_to_allnodes_group(i);
187 sd->groups = &sched_group_allnodes[group];
188 p = sd;
189 } else
190 p = NULL;
191
192 sd = &per_cpu(node_domains, i);
193 *sd = SD_NODE_INIT;
194 sd->span = sched_domain_node_span(cpu_to_node(i));
195 sd->parent = p;
196 cpus_and(sd->span, sd->span, *cpu_map);
197#endif
198
199 p = sd;
200 sd = &per_cpu(phys_domains, i);
201 group = cpu_to_phys_group(i);
202 *sd = SD_CPU_INIT;
203 sd->span = nodemask;
204 sd->parent = p;
205 sd->groups = &sched_group_phys[group];
206
207#ifdef CONFIG_SCHED_SMT
208 p = sd;
209 sd = &per_cpu(cpu_domains, i);
210 group = cpu_to_cpu_group(i);
211 *sd = SD_SIBLING_INIT;
212 sd->span = cpu_sibling_map[i];
213 cpus_and(sd->span, sd->span, *cpu_map);
214 sd->parent = p;
215 sd->groups = &sched_group_cpus[group];
216#endif
217 }
218
219#ifdef CONFIG_SCHED_SMT
220 /* Set up CPU (sibling) groups */
221 for_each_cpu_mask(i, *cpu_map) {
222 cpumask_t this_sibling_map = cpu_sibling_map[i];
223 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
224 if (i != first_cpu(this_sibling_map))
225 continue;
226
227 init_sched_build_groups(sched_group_cpus, this_sibling_map,
228 &cpu_to_cpu_group);
229 }
230#endif
231
232 /* Set up physical groups */
233 for (i = 0; i < MAX_NUMNODES; i++) {
234 cpumask_t nodemask = node_to_cpumask(i);
235
236 cpus_and(nodemask, nodemask, *cpu_map);
237 if (cpus_empty(nodemask))
238 continue;
239
240 init_sched_build_groups(sched_group_phys, nodemask,
241 &cpu_to_phys_group);
242 }
243
244#ifdef CONFIG_NUMA
245 if (sched_group_allnodes)
246 init_sched_build_groups(sched_group_allnodes, *cpu_map,
247 &cpu_to_allnodes_group);
248
249 for (i = 0; i < MAX_NUMNODES; i++) {
250 /* Set up node groups */
251 struct sched_group *sg, *prev;
252 cpumask_t nodemask = node_to_cpumask(i);
253 cpumask_t domainspan;
254 cpumask_t covered = CPU_MASK_NONE;
255 int j;
256
257 cpus_and(nodemask, nodemask, *cpu_map);
258 if (cpus_empty(nodemask)) {
259 sched_group_nodes[i] = NULL;
260 continue;
261 }
262
263 domainspan = sched_domain_node_span(i);
264 cpus_and(domainspan, domainspan, *cpu_map);
265
266 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
267 sched_group_nodes[i] = sg;
268 for_each_cpu_mask(j, nodemask) {
269 struct sched_domain *sd;
270 sd = &per_cpu(node_domains, j);
271 sd->groups = sg;
272 if (sd->groups == NULL) {
273 /* Turn off balancing if we have no groups */
274 sd->flags = 0;
275 }
276 }
277 if (!sg) {
278 printk(KERN_WARNING
279 "Can not alloc domain group for node %d\n", i);
280 continue;
281 }
282 sg->cpu_power = 0;
283 sg->cpumask = nodemask;
284 cpus_or(covered, covered, nodemask);
285 prev = sg;
286
287 for (j = 0; j < MAX_NUMNODES; j++) {
288 cpumask_t tmp, notcovered;
289 int n = (i + j) % MAX_NUMNODES;
290
291 cpus_complement(notcovered, covered);
292 cpus_and(tmp, notcovered, *cpu_map);
293 cpus_and(tmp, tmp, domainspan);
294 if (cpus_empty(tmp))
295 break;
296
297 nodemask = node_to_cpumask(n);
298 cpus_and(tmp, tmp, nodemask);
299 if (cpus_empty(tmp))
300 continue;
301
302 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
303 if (!sg) {
304 printk(KERN_WARNING
305 "Can not alloc domain group for node %d\n", j);
306 break;
307 }
308 sg->cpu_power = 0;
309 sg->cpumask = tmp;
310 cpus_or(covered, covered, tmp);
311 prev->next = sg;
312 prev = sg;
313 }
314 prev->next = sched_group_nodes[i];
315 }
316#endif
317
318 /* Calculate CPU power for physical packages and nodes */
319 for_each_cpu_mask(i, *cpu_map) {
320 int power;
321 struct sched_domain *sd;
322#ifdef CONFIG_SCHED_SMT
323 sd = &per_cpu(cpu_domains, i);
324 power = SCHED_LOAD_SCALE;
325 sd->groups->cpu_power = power;
326#endif
327
328 sd = &per_cpu(phys_domains, i);
329 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
330 (cpus_weight(sd->groups->cpumask)-1) / 10;
331 sd->groups->cpu_power = power;
332
333#ifdef CONFIG_NUMA
334 sd = &per_cpu(allnodes_domains, i);
335 if (sd->groups) {
336 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
337 (cpus_weight(sd->groups->cpumask)-1) / 10;
338 sd->groups->cpu_power = power;
339 }
340#endif
341 }
342
343#ifdef CONFIG_NUMA
344 for (i = 0; i < MAX_NUMNODES; i++) {
345 struct sched_group *sg = sched_group_nodes[i];
346 int j;
347
348 if (sg == NULL)
349 continue;
350next_sg:
351 for_each_cpu_mask(j, sg->cpumask) {
352 struct sched_domain *sd;
353 int power;
354
355 sd = &per_cpu(phys_domains, j);
356 if (j != first_cpu(sd->groups->cpumask)) {
357 /*
358 * Only add "power" once for each
359 * physical package.
360 */
361 continue;
362 }
363 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
364 (cpus_weight(sd->groups->cpumask)-1) / 10;
365
366 sg->cpu_power += power;
367 }
368 sg = sg->next;
369 if (sg != sched_group_nodes[i])
370 goto next_sg;
371 }
372#endif
373
374 /* Attach the domains */
375 for_each_cpu_mask(i, *cpu_map) {
376 struct sched_domain *sd;
377#ifdef CONFIG_SCHED_SMT
378 sd = &per_cpu(cpu_domains, i);
379#else
380 sd = &per_cpu(phys_domains, i);
381#endif
382 cpu_attach_domain(sd, i);
383 }
384}
385/*
386 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
387 */
388void arch_init_sched_domains(const cpumask_t *cpu_map)
389{
390 cpumask_t cpu_default_map;
391
392 /*
393 * Setup mask for cpus without special case scheduling requirements.
394 * For now this just excludes isolated cpus, but could be used to
395 * exclude other special cases in the future.
396 */
397 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
398
399 build_sched_domains(&cpu_default_map);
400}
401
402void arch_destroy_sched_domains(const cpumask_t *cpu_map)
403{
404#ifdef CONFIG_NUMA
405 int i;
406 int cpu;
407
408 for_each_cpu_mask(cpu, *cpu_map) {
409 struct sched_group *sched_group_allnodes
410 = sched_group_allnodes_bycpu[cpu];
411 struct sched_group **sched_group_nodes
412 = sched_group_nodes_bycpu[cpu];
413
414 if (sched_group_allnodes) {
415 kfree(sched_group_allnodes);
416 sched_group_allnodes_bycpu[cpu] = NULL;
417 }
418
419 if (!sched_group_nodes)
420 continue;
421
422 for (i = 0; i < MAX_NUMNODES; i++) {
423 cpumask_t nodemask = node_to_cpumask(i);
424 struct sched_group *oldsg, *sg = sched_group_nodes[i];
425
426 cpus_and(nodemask, nodemask, *cpu_map);
427 if (cpus_empty(nodemask))
428 continue;
429
430 if (sg == NULL)
431 continue;
432 sg = sg->next;
433next_sg:
434 oldsg = sg;
435 sg = sg->next;
436 kfree(oldsg);
437 if (oldsg != sched_group_nodes[i])
438 goto next_sg;
439 }
440 kfree(sched_group_nodes);
441 sched_group_nodes_bycpu[cpu] = NULL;
442 }
443#endif
444}
diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h
index 91bbd1f22461..94e07e727395 100644
--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
20#include <asm/ptrace.h> 20#include <asm/ptrace.h>
21#include <asm/ustack.h> 21#include <asm/ustack.h>
22 22
23/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
24#define ARCH_HAS_SCHED_DOMAIN
25
26#define IA64_NUM_DBG_REGS 8 23#define IA64_NUM_DBG_REGS 8
27/* 24/*
28 * Limits for PMC and PMD are set to less than maximum architected values 25 * Limits for PMC and PMD are set to less than maximum architected values
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 399bc29729fd..a9f738bf18a7 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -98,29 +98,6 @@ void build_cpu_to_node_map(void);
98 .nr_balance_failed = 0, \ 98 .nr_balance_failed = 0, \
99} 99}
100 100
101/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
102#define SD_ALLNODES_INIT (struct sched_domain) { \
103 .span = CPU_MASK_NONE, \
104 .parent = NULL, \
105 .groups = NULL, \
106 .min_interval = 64, \
107 .max_interval = 64*num_online_cpus(), \
108 .busy_factor = 128, \
109 .imbalance_pct = 133, \
110 .cache_hot_time = (10*1000000), \
111 .cache_nice_tries = 1, \
112 .busy_idx = 3, \
113 .idle_idx = 3, \
114 .newidle_idx = 0, /* unused */ \
115 .wake_idx = 0, /* unused */ \
116 .forkexec_idx = 0, /* unused */ \
117 .per_cpu_gain = 100, \
118 .flags = SD_LOAD_BALANCE, \
119 .last_balance = jiffies, \
120 .balance_interval = 64, \
121 .nr_balance_failed = 0, \
122}
123
124#endif /* CONFIG_NUMA */ 101#endif /* CONFIG_NUMA */
125 102
126#include <asm-generic/topology.h> 103#include <asm-generic/topology.h>
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b5a22ea80045..ea1b5f32ec5c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -564,13 +564,6 @@ struct sched_domain {
564 564
565extern void partition_sched_domains(cpumask_t *partition1, 565extern void partition_sched_domains(cpumask_t *partition1,
566 cpumask_t *partition2); 566 cpumask_t *partition2);
567#ifdef ARCH_HAS_SCHED_DOMAIN
568/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
569extern cpumask_t cpu_isolated_map;
570extern void init_sched_build_groups(struct sched_group groups[],
571 cpumask_t span, int (*group_fn)(int cpu));
572extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
573#endif /* ARCH_HAS_SCHED_DOMAIN */
574#endif /* CONFIG_SMP */ 567#endif /* CONFIG_SMP */
575 568
576 569
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 0320225e96da..3df1d474e5c5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -135,6 +135,29 @@
135} 135}
136#endif 136#endif
137 137
138/* sched_domains SD_ALLNODES_INIT for NUMA machines */
139#define SD_ALLNODES_INIT (struct sched_domain) { \
140 .span = CPU_MASK_NONE, \
141 .parent = NULL, \
142 .groups = NULL, \
143 .min_interval = 64, \
144 .max_interval = 64*num_online_cpus(), \
145 .busy_factor = 128, \
146 .imbalance_pct = 133, \
147 .cache_hot_time = (10*1000000), \
148 .cache_nice_tries = 1, \
149 .busy_idx = 3, \
150 .idle_idx = 3, \
151 .newidle_idx = 0, /* unused */ \
152 .wake_idx = 0, /* unused */ \
153 .forkexec_idx = 0, /* unused */ \
154 .per_cpu_gain = 100, \
155 .flags = SD_LOAD_BALANCE, \
156 .last_balance = jiffies, \
157 .balance_interval = 64, \
158 .nr_balance_failed = 0, \
159}
160
138#ifdef CONFIG_NUMA 161#ifdef CONFIG_NUMA
139#ifndef SD_NODE_INIT 162#ifndef SD_NODE_INIT
140#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 163#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..50860ad5b624 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4780 * hold the hotplug lock.
4781 */ 4781 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4782static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4783{
4784 runqueue_t *rq = cpu_rq(cpu); 4784 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4785 struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4802}
4803 4803
4804/* cpus with isolated domains */ 4804/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4805static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4806
4807/* Setup the mask of cpus configured for isolated domains */ 4807/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4808static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4830 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4831 * and ->cpu_power to 0.
4832 */ 4832 */
4833void init_sched_build_groups(struct sched_group groups[], 4833static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4834 int (*group_fn)(int cpu))
4835{ 4835{
4836 struct sched_group *first = NULL, *last = NULL; 4836 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4837 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4864 last->next = first;
4865} 4865}
4866 4866
4867#define SD_NODES_PER_DOMAIN 16
4867 4868
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4869#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4870/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4871 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4872 * @node: node whose sched_domain we're building
4872#else 4873 * @used_nodes: nodes already in the sched_domain
4874 *
4875 * Find the next node to include in a given scheduling domain. Simply
4876 * finds the closest node not already in the @used_nodes map.
4877 *
4878 * Should use nodemask_t.
4879 */
4880static int find_next_best_node(int node, unsigned long *used_nodes)
4881{
4882 int i, n, val, min_val, best_node = 0;
4883
4884 min_val = INT_MAX;
4885
4886 for (i = 0; i < MAX_NUMNODES; i++) {
4887 /* Start at @node */
4888 n = (node + i) % MAX_NUMNODES;
4889
4890 if (!nr_cpus_node(n))
4891 continue;
4892
4893 /* Skip already used nodes */
4894 if (test_bit(n, used_nodes))
4895 continue;
4896
4897 /* Simple min distance search */
4898 val = node_distance(node, n);
4899
4900 if (val < min_val) {
4901 min_val = val;
4902 best_node = n;
4903 }
4904 }
4905
4906 set_bit(best_node, used_nodes);
4907 return best_node;
4908}
4909
4910/**
4911 * sched_domain_node_span - get a cpumask for a node's sched_domain
4912 * @node: node whose cpumask we're constructing
4913 * @size: number of nodes to include in this span
4914 *
4915 * Given a node, construct a good cpumask for its sched_domain to span. It
4916 * should be one that prevents unnecessary balancing, but also spreads tasks
4917 * out optimally.
4918 */
4919static cpumask_t sched_domain_node_span(int node)
4920{
4921 int i;
4922 cpumask_t span, nodemask;
4923 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
4924
4925 cpus_clear(span);
4926 bitmap_zero(used_nodes, MAX_NUMNODES);
4927
4928 nodemask = node_to_cpumask(node);
4929 cpus_or(span, span, nodemask);
4930 set_bit(node, used_nodes);
4931
4932 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
4933 int next_node = find_next_best_node(node, used_nodes);
4934 nodemask = node_to_cpumask(next_node);
4935 cpus_or(span, span, nodemask);
4936 }
4937
4938 return span;
4939}
4940#endif
4941
4942/*
4943 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
4944 * can switch it on easily if needed.
4945 */
4873#ifdef CONFIG_SCHED_SMT 4946#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4947static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 4948static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
4891} 4964}
4892 4965
4893#ifdef CONFIG_NUMA 4966#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 4967/*
4905 * The domains setup code relies on siblings not spanning 4968 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 4969 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 4970 * gets dynamically allocated.
4908 */ 4971 */
4909static void check_sibling_maps(void) 4972static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 4973static struct sched_group *sched_group_nodes[MAX_NUMNODES];
4911 int i, j;
4912 4974
4913 for_each_online_cpu(i) { 4975static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 4976static struct sched_group sched_group_allnodes[MAX_NUMNODES];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 4977
4916 printk(KERN_INFO "warning: CPU %d siblings map " 4978static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 4979{
4918 "them.\n", i); 4980 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 4981}
4925#endif 4982#endif
4926 4983
@@ -4928,7 +4985,7 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 4985 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 4986 * to the individual cpus
4930 */ 4987 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 4988void build_sched_domains(const cpumask_t *cpu_map)
4932{ 4989{
4933 int i; 4990 int i;
4934 4991
@@ -4943,11 +5000,22 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5000 cpus_and(nodemask, nodemask, *cpu_map);
4944 5001
4945#ifdef CONFIG_NUMA 5002#ifdef CONFIG_NUMA
5003 if (num_online_cpus()
5004 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5005 sd = &per_cpu(allnodes_domains, i);
5006 *sd = SD_ALLNODES_INIT;
5007 sd->span = *cpu_map;
5008 group = cpu_to_allnodes_group(i);
5009 sd->groups = &sched_group_allnodes[group];
5010 p = sd;
5011 } else
5012 p = NULL;
5013
4946 sd = &per_cpu(node_domains, i); 5014 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5015 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5016 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5017 sd->parent = p;
5018 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5019#endif
4952 5020
4953 p = sd; 5021 p = sd;
@@ -4972,7 +5040,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5040
4973#ifdef CONFIG_SCHED_SMT 5041#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5042 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5043 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5044 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5045 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5046 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5065,74 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5065
4998#ifdef CONFIG_NUMA 5066#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5067 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5068 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5001 &cpu_to_node_group); 5069 &cpu_to_allnodes_group);
5070
5071 for (i = 0; i < MAX_NUMNODES; i++) {
5072 /* Set up node groups */
5073 struct sched_group *sg, *prev;
5074 cpumask_t nodemask = node_to_cpumask(i);
5075 cpumask_t domainspan;
5076 cpumask_t covered = CPU_MASK_NONE;
5077 int j;
5078
5079 cpus_and(nodemask, nodemask, *cpu_map);
5080 if (cpus_empty(nodemask))
5081 continue;
5082
5083 domainspan = sched_domain_node_span(i);
5084 cpus_and(domainspan, domainspan, *cpu_map);
5085
5086 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5087 sched_group_nodes[i] = sg;
5088 for_each_cpu_mask(j, nodemask) {
5089 struct sched_domain *sd;
5090 sd = &per_cpu(node_domains, j);
5091 sd->groups = sg;
5092 if (sd->groups == NULL) {
5093 /* Turn off balancing if we have no groups */
5094 sd->flags = 0;
5095 }
5096 }
5097 if (!sg) {
5098 printk(KERN_WARNING
5099 "Can not alloc domain group for node %d\n", i);
5100 continue;
5101 }
5102 sg->cpu_power = 0;
5103 sg->cpumask = nodemask;
5104 cpus_or(covered, covered, nodemask);
5105 prev = sg;
5106
5107 for (j = 0; j < MAX_NUMNODES; j++) {
5108 cpumask_t tmp, notcovered;
5109 int n = (i + j) % MAX_NUMNODES;
5110
5111 cpus_complement(notcovered, covered);
5112 cpus_and(tmp, notcovered, *cpu_map);
5113 cpus_and(tmp, tmp, domainspan);
5114 if (cpus_empty(tmp))
5115 break;
5116
5117 nodemask = node_to_cpumask(n);
5118 cpus_and(tmp, tmp, nodemask);
5119 if (cpus_empty(tmp))
5120 continue;
5121
5122 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5123 if (!sg) {
5124 printk(KERN_WARNING
5125 "Can not alloc domain group for node %d\n", j);
5126 break;
5127 }
5128 sg->cpu_power = 0;
5129 sg->cpumask = tmp;
5130 cpus_or(covered, covered, tmp);
5131 prev->next = sg;
5132 prev = sg;
5133 }
5134 prev->next = sched_group_nodes[i];
5135 }
5002#endif 5136#endif
5003 5137
5004 /* Calculate CPU power for physical packages and nodes */ 5138 /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5151,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5151 sd->groups->cpu_power = power;
5018 5152
5019#ifdef CONFIG_NUMA 5153#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5154 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5155 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5156 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5157 (cpus_weight(sd->groups->cpumask)-1) / 10;
5158 sd->groups->cpu_power = power;
5024 } 5159 }
5025#endif 5160#endif
5026 } 5161 }
5027 5162
5163#ifdef CONFIG_NUMA
5164 for (i = 0; i < MAX_NUMNODES; i++) {
5165 struct sched_group *sg = sched_group_nodes[i];
5166 int j;
5167
5168 if (sg == NULL)
5169 continue;
5170next_sg:
5171 for_each_cpu_mask(j, sg->cpumask) {
5172 struct sched_domain *sd;
5173 int power;
5174
5175 sd = &per_cpu(phys_domains, j);
5176 if (j != first_cpu(sd->groups->cpumask)) {
5177 /*
5178 * Only add "power" once for each
5179 * physical package.
5180 */
5181 continue;
5182 }
5183 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5184 (cpus_weight(sd->groups->cpumask)-1) / 10;
5185
5186 sg->cpu_power += power;
5187 }
5188 sg = sg->next;
5189 if (sg != sched_group_nodes[i])
5190 goto next_sg;
5191 }
5192#endif
5193
5028 /* Attach the domains */ 5194 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5195 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5196 struct sched_domain *sd;
@@ -5039,13 +5205,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5205/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5206 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5207 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5208static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5209{
5044 cpumask_t cpu_default_map; 5210 cpumask_t cpu_default_map;
5045 5211
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5212 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5213 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5214 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5221,29 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5221
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5222static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5223{
5061 /* Do nothing: everything is statically allocated. */ 5224#ifdef CONFIG_NUMA
5062} 5225 int i;
5226 for (i = 0; i < MAX_NUMNODES; i++) {
5227 cpumask_t nodemask = node_to_cpumask(i);
5228 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5063 5229
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5230 cpus_and(nodemask, nodemask, *cpu_map);
5231 if (cpus_empty(nodemask))
5232 continue;
5233
5234 if (sg == NULL)
5235 continue;
5236 sg = sg->next;
5237next_sg:
5238 oldsg = sg;
5239 sg = sg->next;
5240 kfree(oldsg);
5241 if (oldsg != sched_group_nodes[i])
5242 goto next_sg;
5243 sched_group_nodes[i] = NULL;
5244 }
5245#endif
5246}
5065 5247
5066/* 5248/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5249 * Detach sched domains from a group of cpus specified in cpu_map