diff options
author | John Hawkes <hawkes@sgi.com> | 2005-09-06 18:18:06 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-07 19:57:39 -0400 |
commit | f68f447e8389de9a62e3e80c3c5823cce484c2e5 (patch) | |
tree | 218ec8e11ecbfc730e248b3dae7ebbbe9fdc5da6 /arch/ia64 | |
parent | 38f18527592756d24a12e84c0713e8c902ba7f15 (diff) |
[PATCH] ia64 cpuset + build_sched_domains() mangles structures
I've already sent this to the maintainers, and this is now being sent to a
larger community audience. I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.
The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN. It responds to the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset by rebuilding the CPU Scheduler sched domains and sched
groups so that the CPUs in this cpu-exclusive cpuset are separated from the
remaining non-isolated CPUs. This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.
Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong to both an isolated cpuset and a non-isolated cpuset. Declaring a
cpuset that violates this assumption will produce flawed data structures
and will oops the kernel.
To trigger the problem (on a NUMA system with more than one CPU per node):
cd /dev/cpuset
mkdir newcpuset
cd newcpuset
echo 0 >cpus
echo 0 >mems
echo 1 >cpu_exclusive
I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also. The fix
involves dynamically allocating sched_group_nodes[] and
sched_group_allnodes[] for each invocation of build_sched_domains(), rather
than using global arrays for these structures. Care must be taken to
remember kmalloc() addresses so that arch_destroy_sched_domains() can
properly kfree() the new dynamic structures.
Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/ia64')
-rw-r--r-- | arch/ia64/kernel/domain.c | 90 |
1 files changed, 69 insertions, 21 deletions
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c index bbb8efe126b7..e907109983f1 100644 --- a/arch/ia64/kernel/domain.c +++ b/arch/ia64/kernel/domain.c | |||
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu) | |||
120 | * gets dynamically allocated. | 120 | * gets dynamically allocated. |
121 | */ | 121 | */ |
122 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 122 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
123 | static struct sched_group *sched_group_nodes[MAX_NUMNODES]; | 123 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
124 | 124 | ||
125 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 125 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
126 | static struct sched_group sched_group_allnodes[MAX_NUMNODES]; | 126 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; |
127 | 127 | ||
128 | static int cpu_to_allnodes_group(int cpu) | 128 | static int cpu_to_allnodes_group(int cpu) |
129 | { | 129 | { |
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu) | |||
138 | void build_sched_domains(const cpumask_t *cpu_map) | 138 | void build_sched_domains(const cpumask_t *cpu_map) |
139 | { | 139 | { |
140 | int i; | 140 | int i; |
141 | #ifdef CONFIG_NUMA | ||
142 | struct sched_group **sched_group_nodes = NULL; | ||
143 | struct sched_group *sched_group_allnodes = NULL; | ||
144 | |||
145 | /* | ||
146 | * Allocate the per-node list of sched groups | ||
147 | */ | ||
148 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | ||
149 | GFP_ATOMIC); | ||
150 | if (!sched_group_nodes) { | ||
151 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
152 | return; | ||
153 | } | ||
154 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
155 | #endif | ||
141 | 156 | ||
142 | /* | 157 | /* |
143 | * Set up domains for cpus specified by the cpu_map. | 158 | * Set up domains for cpus specified by the cpu_map. |
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
150 | cpus_and(nodemask, nodemask, *cpu_map); | 165 | cpus_and(nodemask, nodemask, *cpu_map); |
151 | 166 | ||
152 | #ifdef CONFIG_NUMA | 167 | #ifdef CONFIG_NUMA |
153 | if (num_online_cpus() | 168 | if (cpus_weight(*cpu_map) |
154 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 169 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
170 | if (!sched_group_allnodes) { | ||
171 | sched_group_allnodes | ||
172 | = kmalloc(sizeof(struct sched_group) | ||
173 | * MAX_NUMNODES, | ||
174 | GFP_KERNEL); | ||
175 | if (!sched_group_allnodes) { | ||
176 | printk(KERN_WARNING | ||
177 | "Can not alloc allnodes sched group\n"); | ||
178 | break; | ||
179 | } | ||
180 | sched_group_allnodes_bycpu[i] | ||
181 | = sched_group_allnodes; | ||
182 | } | ||
155 | sd = &per_cpu(allnodes_domains, i); | 183 | sd = &per_cpu(allnodes_domains, i); |
156 | *sd = SD_ALLNODES_INIT; | 184 | *sd = SD_ALLNODES_INIT; |
157 | sd->span = *cpu_map; | 185 | sd->span = *cpu_map; |
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
214 | } | 242 | } |
215 | 243 | ||
216 | #ifdef CONFIG_NUMA | 244 | #ifdef CONFIG_NUMA |
217 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 245 | if (sched_group_allnodes) |
218 | &cpu_to_allnodes_group); | 246 | init_sched_build_groups(sched_group_allnodes, *cpu_map, |
247 | &cpu_to_allnodes_group); | ||
219 | 248 | ||
220 | for (i = 0; i < MAX_NUMNODES; i++) { | 249 | for (i = 0; i < MAX_NUMNODES; i++) { |
221 | /* Set up node groups */ | 250 | /* Set up node groups */ |
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
226 | int j; | 255 | int j; |
227 | 256 | ||
228 | cpus_and(nodemask, nodemask, *cpu_map); | 257 | cpus_and(nodemask, nodemask, *cpu_map); |
229 | if (cpus_empty(nodemask)) | 258 | if (cpus_empty(nodemask)) { |
259 | sched_group_nodes[i] = NULL; | ||
230 | continue; | 260 | continue; |
261 | } | ||
231 | 262 | ||
232 | domainspan = sched_domain_node_span(i); | 263 | domainspan = sched_domain_node_span(i); |
233 | cpus_and(domainspan, domainspan, *cpu_map); | 264 | cpus_and(domainspan, domainspan, *cpu_map); |
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map) | |||
372 | { | 403 | { |
373 | #ifdef CONFIG_NUMA | 404 | #ifdef CONFIG_NUMA |
374 | int i; | 405 | int i; |
375 | for (i = 0; i < MAX_NUMNODES; i++) { | 406 | int cpu; |
376 | cpumask_t nodemask = node_to_cpumask(i); | ||
377 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
378 | 407 | ||
379 | cpus_and(nodemask, nodemask, *cpu_map); | 408 | for_each_cpu_mask(cpu, *cpu_map) { |
380 | if (cpus_empty(nodemask)) | 409 | struct sched_group *sched_group_allnodes |
381 | continue; | 410 | = sched_group_allnodes_bycpu[cpu]; |
411 | struct sched_group **sched_group_nodes | ||
412 | = sched_group_nodes_bycpu[cpu]; | ||
382 | 413 | ||
383 | if (sg == NULL) | 414 | if (sched_group_allnodes) { |
415 | kfree(sched_group_allnodes); | ||
416 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
417 | } | ||
418 | |||
419 | if (!sched_group_nodes) | ||
384 | continue; | 420 | continue; |
385 | sg = sg->next; | 421 | |
422 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
423 | cpumask_t nodemask = node_to_cpumask(i); | ||
424 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
425 | |||
426 | cpus_and(nodemask, nodemask, *cpu_map); | ||
427 | if (cpus_empty(nodemask)) | ||
428 | continue; | ||
429 | |||
430 | if (sg == NULL) | ||
431 | continue; | ||
432 | sg = sg->next; | ||
386 | next_sg: | 433 | next_sg: |
387 | oldsg = sg; | 434 | oldsg = sg; |
388 | sg = sg->next; | 435 | sg = sg->next; |
389 | kfree(oldsg); | 436 | kfree(oldsg); |
390 | if (oldsg != sched_group_nodes[i]) | 437 | if (oldsg != sched_group_nodes[i]) |
391 | goto next_sg; | 438 | goto next_sg; |
392 | sched_group_nodes[i] = NULL; | 439 | } |
440 | kfree(sched_group_nodes); | ||
441 | sched_group_nodes_bycpu[cpu] = NULL; | ||
393 | } | 442 | } |
394 | #endif | 443 | #endif |
395 | } | 444 | } |
396 | |||