about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author John Hawkes <hawkes@sgi.com> 2005-09-06 18:18:06 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2005-09-07 19:57:39 -0400
commit f68f447e8389de9a62e3e80c3c5823cce484c2e5 (patch)
tree 218ec8e11ecbfc730e248b3dae7ebbbe9fdc5da6
parent 38f18527592756d24a12e84c0713e8c902ba7f15 (diff)
[PATCH] ia64 cpuset + build_sched_domains() mangles structures
I've already sent this to the maintainers, and this is now being sent to a larger community audience. I have fixed a problem with the ia64 version of build_sched_domains(), but a similar fix still needs to be made to the generic build_sched_domains() in kernel/sched.c.

The "dynamic sched domains" functionality has recently been merged into 2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a. "isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched groups to separate away the CPUs in this cpu-exclusive cpuset from the remainder of the non-isolated CPUs. This allows the non-isolated CPUs to completely ignore the isolated CPUs when doing load-balancing.

Unfortunately, build_sched_domains() expects that a sched domain will include all the CPUs of each node in the domain, i.e., that no node will belong in both an isolated cpuset and a non-isolated cpuset. Declaring a cpuset that violates this presumption will produce flawed data structures and will oops the kernel.

To trigger the problem (on a NUMA system with >1 CPUs per node):

    cd /dev/cpuset
    mkdir newcpuset
    cd newcpuset
    echo 0 >cpus
    echo 0 >mems
    echo 1 >cpu_exclusive

I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node). A similar shortcoming exists in the generic build_sched_domains() (in kernel/sched.c) for NUMA, and that needs to be fixed also.

The fix involves dynamically allocating sched_group_nodes[] and sched_group_allnodes[] for each invocation of build_sched_domains(), rather than using global arrays for these structures. Care must be taken to remember kmalloc() addresses so that arch_destroy_sched_domains() can properly kfree() the new dynamic structures.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- arch/ia64/kernel/domain.c 90
1 files changed, 69 insertions, 21 deletions
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
index bbb8efe126b7..e907109983f1 100644
--- a/arch/ia64/kernel/domain.c
+++ b/arch/ia64/kernel/domain.c
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
120 * gets dynamically allocated. 120 * gets dynamically allocated.
121 */ 121 */
122static DEFINE_PER_CPU(struct sched_domain, node_domains); 122static DEFINE_PER_CPU(struct sched_domain, node_domains);
123static struct sched_group *sched_group_nodes[MAX_NUMNODES]; 123static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
124 124
125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 125static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
126static struct sched_group sched_group_allnodes[MAX_NUMNODES]; 126static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
127 127
128static int cpu_to_allnodes_group(int cpu) 128static int cpu_to_allnodes_group(int cpu)
129{ 129{
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
138void build_sched_domains(const cpumask_t *cpu_map) 138void build_sched_domains(const cpumask_t *cpu_map)
139{ 139{
140 int i; 140 int i;
141#ifdef CONFIG_NUMA
142 struct sched_group **sched_group_nodes = NULL;
143 struct sched_group *sched_group_allnodes = NULL;
144
145 /*
146 * Allocate the per-node list of sched groups
147 */
148 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
149 GFP_ATOMIC);
150 if (!sched_group_nodes) {
151 printk(KERN_WARNING "Can not alloc sched group node list\n");
152 return;
153 }
154 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
155#endif
141 156
142 /* 157 /*
143 * Set up domains for cpus specified by the cpu_map. 158 * Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
150 cpus_and(nodemask, nodemask, *cpu_map); 165 cpus_and(nodemask, nodemask, *cpu_map);
151 166
152#ifdef CONFIG_NUMA 167#ifdef CONFIG_NUMA
153 if (num_online_cpus() 168 if (cpus_weight(*cpu_map)
154 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 169 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
170 if (!sched_group_allnodes) {
171 sched_group_allnodes
172 = kmalloc(sizeof(struct sched_group)
173 * MAX_NUMNODES,
174 GFP_KERNEL);
175 if (!sched_group_allnodes) {
176 printk(KERN_WARNING
177 "Can not alloc allnodes sched group\n");
178 break;
179 }
180 sched_group_allnodes_bycpu[i]
181 = sched_group_allnodes;
182 }
155 sd = &per_cpu(allnodes_domains, i); 183 sd = &per_cpu(allnodes_domains, i);
156 *sd = SD_ALLNODES_INIT; 184 *sd = SD_ALLNODES_INIT;
157 sd->span = *cpu_map; 185 sd->span = *cpu_map;
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
214 } 242 }
215 243
216#ifdef CONFIG_NUMA 244#ifdef CONFIG_NUMA
217 init_sched_build_groups(sched_group_allnodes, *cpu_map, 245 if (sched_group_allnodes)
218 &cpu_to_allnodes_group); 246 init_sched_build_groups(sched_group_allnodes, *cpu_map,
247 &cpu_to_allnodes_group);
219 248
220 for (i = 0; i < MAX_NUMNODES; i++) { 249 for (i = 0; i < MAX_NUMNODES; i++) {
221 /* Set up node groups */ 250 /* Set up node groups */
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
226 int j; 255 int j;
227 256
228 cpus_and(nodemask, nodemask, *cpu_map); 257 cpus_and(nodemask, nodemask, *cpu_map);
229 if (cpus_empty(nodemask)) 258 if (cpus_empty(nodemask)) {
259 sched_group_nodes[i] = NULL;
230 continue; 260 continue;
261 }
231 262
232 domainspan = sched_domain_node_span(i); 263 domainspan = sched_domain_node_span(i);
233 cpus_and(domainspan, domainspan, *cpu_map); 264 cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
372{ 403{
373#ifdef CONFIG_NUMA 404#ifdef CONFIG_NUMA
374 int i; 405 int i;
375 for (i = 0; i < MAX_NUMNODES; i++) { 406 int cpu;
376 cpumask_t nodemask = node_to_cpumask(i);
377 struct sched_group *oldsg, *sg = sched_group_nodes[i];
378 407
379 cpus_and(nodemask, nodemask, *cpu_map); 408 for_each_cpu_mask(cpu, *cpu_map) {
380 if (cpus_empty(nodemask)) 409 struct sched_group *sched_group_allnodes
381 continue; 410 = sched_group_allnodes_bycpu[cpu];
411 struct sched_group **sched_group_nodes
412 = sched_group_nodes_bycpu[cpu];
382 413
383 if (sg == NULL) 414 if (sched_group_allnodes) {
415 kfree(sched_group_allnodes);
416 sched_group_allnodes_bycpu[cpu] = NULL;
417 }
418
419 if (!sched_group_nodes)
384 continue; 420 continue;
385 sg = sg->next; 421
422 for (i = 0; i < MAX_NUMNODES; i++) {
423 cpumask_t nodemask = node_to_cpumask(i);
424 struct sched_group *oldsg, *sg = sched_group_nodes[i];
425
426 cpus_and(nodemask, nodemask, *cpu_map);
427 if (cpus_empty(nodemask))
428 continue;
429
430 if (sg == NULL)
431 continue;
432 sg = sg->next;
386next_sg: 433next_sg:
387 oldsg = sg; 434 oldsg = sg;
388 sg = sg->next; 435 sg = sg->next;
389 kfree(oldsg); 436 kfree(oldsg);
390 if (oldsg != sched_group_nodes[i]) 437 if (oldsg != sched_group_nodes[i])
391 goto next_sg; 438 goto next_sg;
392 sched_group_nodes[i] = NULL; 439 }
440 kfree(sched_group_nodes);
441 sched_group_nodes_bycpu[cpu] = NULL;
393 } 442 }
394#endif 443#endif
395} 444}
396