[PATCH] ia64 cpuset + build_sched_domains() mangles structures

I've already sent this to the maintainers, and this is now being sent to a larger community audience. I have fixed a problem with the ia64 version of build_sched_domains(), but a similar fix still needs to be made to the generic build_sched_domains() in kernel/sched.c.

The "dynamic sched domains" functionality has recently been merged into 2.6.13-rcN. It sees the dynamic declaration of a cpu-exclusive (a.k.a. "isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched groups to separate the CPUs in this cpu-exclusive cpuset from the remainder of the non-isolated CPUs. This allows the non-isolated CPUs to completely ignore the isolated CPUs when doing load-balancing.

Unfortunately, build_sched_domains() expects that a sched domain will include all the CPUs of each node in the domain, i.e., that no node will belong to both an isolated cpuset and a non-isolated cpuset. Declaring a cpuset that violates this presumption will produce flawed data structures and will oops the kernel.

To trigger the problem (on a NUMA system with more than one CPU per node):

    cd /dev/cpuset
    mkdir newcpuset
    cd newcpuset
    echo 0 >cpus
    echo 0 >mems
    echo 1 >cpu_exclusive

I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node). A similar shortcoming exists in the generic build_sched_domains() (in kernel/sched.c) for NUMA, and that needs to be fixed also.

The fix involves dynamically allocating sched_group_nodes[] and sched_group_allnodes[] for each invocation of build_sched_domains(), rather than using global arrays for these structures. Care must be taken to remember kmalloc() addresses so that arch_destroy_sched_domains() can properly kfree() the new dynamic structures.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: John Hawkes <hawkes@sgi.com> 2005-09-06 18:18:06 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-09-07 19:57:39 -0400
commit: f68f447e8389de9a62e3e80c3c5823cce484c2e5 (patch)
tree: 218ec8e11ecbfc730e248b3dae7ebbbe9fdc5da6 /arch/ia64
parent: 38f18527592756d24a12e84c0713e8c902ba7f15 (diff)
1 files changed, 69 insertions, 21 deletions
diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
index bbb8efe126b..e907109983f 100644
--- a/arch/ia64/kernel/domain.c
+++ b/arch/ia64/kernel/domain.c
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
 * gets dynamically allocated.
 */
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 static int cpu_to_allnodes_group(int cpu)
 {
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
 void build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
+#ifdef CONFIG_NUMA
+        struct sched_group **sched_group_nodes = NULL;
+        struct sched_group *sched_group_allnodes = NULL;
+        /*
+         * Allocate the per-node list of sched groups
+         */
+        sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+                                           GFP_ATOMIC);
+        if (!sched_group_nodes) {
+                printk(KERN_WARNING "Can not alloc sched group node list\n");
+                return;
+        }
+        sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
        /*
         * Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
                cpus_and(nodemask, nodemask, *cpu_map);
 #ifdef CONFIG_NUMA
-                if (num_online_cpus()
+                if (cpus_weight(*cpu_map)
                                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+                        if (!sched_group_allnodes) {
+                                sched_group_allnodes
+                                        = kmalloc(sizeof(struct sched_group)
+                                                        * MAX_NUMNODES,
+                                                  GFP_KERNEL);
+                                if (!sched_group_allnodes) {
+                                        printk(KERN_WARNING
+                                        "Can not alloc allnodes sched group\n");
+                                        break;
+                                }
+                                sched_group_allnodes_bycpu[i]
+                                                = sched_group_allnodes;
+                        }
                        sd = &per_cpu(allnodes_domains, i);
                        *sd = SD_ALLNODES_INIT;
                        sd->span = *cpu_map;
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
        }
 #ifdef CONFIG_NUMA
-        init_sched_build_groups(sched_group_allnodes, *cpu_map,
+        if (sched_group_allnodes)
-                                &cpu_to_allnodes_group);
+                init_sched_build_groups(sched_group_allnodes, *cpu_map,
+                                        &cpu_to_allnodes_group);
        for (i = 0; i < MAX_NUMNODES; i++) {
                /* Set up node groups */
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
                int j;
                cpus_and(nodemask, nodemask, *cpu_map);
-                if (cpus_empty(nodemask))
+                if (cpus_empty(nodemask)) {
+                        sched_group_nodes[i] = NULL;
                        continue;
+                }
                domainspan = sched_domain_node_span(i);
                cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_NUMA
        int i;
-        for (i = 0; i < MAX_NUMNODES; i++) {
+        int cpu;
-                cpumask_t nodemask = node_to_cpumask(i);
-                struct sched_group *oldsg, *sg = sched_group_nodes[i];
-                cpus_and(nodemask, nodemask, *cpu_map);
+        for_each_cpu_mask(cpu, *cpu_map) {
-                if (cpus_empty(nodemask))
+                struct sched_group *sched_group_allnodes
-                        continue;
+                        = sched_group_allnodes_bycpu[cpu];
+                struct sched_group **sched_group_nodes
+                        = sched_group_nodes_bycpu[cpu];
-                if (sg == NULL)
+                if (sched_group_allnodes) {
+                        kfree(sched_group_allnodes);
+                        sched_group_allnodes_bycpu[cpu] = NULL;
+                }
+                if (!sched_group_nodes)
                        continue;
-                sg = sg->next;
+                for (i = 0; i < MAX_NUMNODES; i++) {
+                        cpumask_t nodemask = node_to_cpumask(i);
+                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
+                        cpus_and(nodemask, nodemask, *cpu_map);
+                        if (cpus_empty(nodemask))
+                                continue;
+                        if (sg == NULL)
+                                continue;
+                        sg = sg->next;
 next_sg:
-                oldsg = sg;
+                        oldsg = sg;
-                sg = sg->next;
+                        sg = sg->next;
-                kfree(oldsg);
+                        kfree(oldsg);
-                if (oldsg != sched_group_nodes[i])
+                        if (oldsg != sched_group_nodes[i])
-                        goto next_sg;
+                                goto next_sg;
-                sched_group_nodes[i] = NULL;
+                }
+                kfree(sched_group_nodes);
+                sched_group_nodes_bycpu[cpu] = NULL;
        }
 #endif
 }
author	John Hawkes <hawkes@sgi.com>	2005-09-06 18:18:06 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-09-07 19:57:39 -0400
commit	f68f447e8389de9a62e3e80c3c5823cce484c2e5 (patch)
tree	218ec8e11ecbfc730e248b3dae7ebbbe9fdc5da6 /arch/ia64
parent	38f18527592756d24a12e84c0713e8c902ba7f15 (diff)

diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c index bbb8efe126b..e907109983f 100644 --- a/arch/ia64/kernel/domain.c +++ b/arch/ia64/kernel/domain.c
@@ -120,10 +120,10 @@ static int cpu_to_phys_group(int cpu)
120	* gets dynamically allocated.	120	* gets dynamically allocated.
121	*/	121	*/
122	static DEFINE_PER_CPU(struct sched_domain, node_domains);	122	static DEFINE_PER_CPU(struct sched_domain, node_domains);
123	static struct sched_group *sched_group_nodes[MAX_NUMNODES];	123	static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
124		124
125	static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);	125	static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
126	static struct sched_group sched_group_allnodes[MAX_NUMNODES];	126	static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
127		127
128	static int cpu_to_allnodes_group(int cpu)	128	static int cpu_to_allnodes_group(int cpu)
129	{	129	{
@@ -138,6 +138,21 @@ static int cpu_to_allnodes_group(int cpu)
138	void build_sched_domains(const cpumask_t *cpu_map)	138	void build_sched_domains(const cpumask_t *cpu_map)
139	{	139	{
140	int i;	140	int i;
		141	#ifdef CONFIG_NUMA
		142	struct sched_group **sched_group_nodes = NULL;
		143	struct sched_group *sched_group_allnodes = NULL;
		144
		145	/*
		146	* Allocate the per-node list of sched groups
		147	*/
		148	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
		149	GFP_ATOMIC);
		150	if (!sched_group_nodes) {
		151	printk(KERN_WARNING "Can not alloc sched group node list\n");
		152	return;
		153	}
		154	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
		155	#endif
141		156
142	/*	157	/*
143	* Set up domains for cpus specified by the cpu_map.	158	* Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
150	cpus_and(nodemask, nodemask, *cpu_map);	165	cpus_and(nodemask, nodemask, *cpu_map);
151		166
152	#ifdef CONFIG_NUMA	167	#ifdef CONFIG_NUMA
153	if (num_online_cpus()	168	if (cpus_weight(*cpu_map)
154	> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {	169	> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
		170	if (!sched_group_allnodes) {
		171	sched_group_allnodes
		172	= kmalloc(sizeof(struct sched_group)
		173	* MAX_NUMNODES,
		174	GFP_KERNEL);
		175	if (!sched_group_allnodes) {
		176	printk(KERN_WARNING
		177	"Can not alloc allnodes sched group\n");
		178	break;
		179	}
		180	sched_group_allnodes_bycpu[i]
		181	= sched_group_allnodes;
		182	}
155	sd = &per_cpu(allnodes_domains, i);	183	sd = &per_cpu(allnodes_domains, i);
156	*sd = SD_ALLNODES_INIT;	184	*sd = SD_ALLNODES_INIT;
157	sd->span = *cpu_map;	185	sd->span = *cpu_map;
@@ -214,8 +242,9 @@ void build_sched_domains(const cpumask_t *cpu_map)
214	}	242	}
215		243
216	#ifdef CONFIG_NUMA	244	#ifdef CONFIG_NUMA
217	init_sched_build_groups(sched_group_allnodes, *cpu_map,	245	if (sched_group_allnodes)
218	&cpu_to_allnodes_group);	246	init_sched_build_groups(sched_group_allnodes, *cpu_map,
		247	&cpu_to_allnodes_group);
219		248
220	for (i = 0; i < MAX_NUMNODES; i++) {	249	for (i = 0; i < MAX_NUMNODES; i++) {
221	/* Set up node groups */	250	/* Set up node groups */
@@ -226,8 +255,10 @@ void build_sched_domains(const cpumask_t *cpu_map)
226	int j;	255	int j;
227		256
228	cpus_and(nodemask, nodemask, *cpu_map);	257	cpus_and(nodemask, nodemask, *cpu_map);
229	if (cpus_empty(nodemask))	258	if (cpus_empty(nodemask)) {
		259	sched_group_nodes[i] = NULL;
230	continue;	260	continue;
		261	}
231		262
232	domainspan = sched_domain_node_span(i);	263	domainspan = sched_domain_node_span(i);
233	cpus_and(domainspan, domainspan, *cpu_map);	264	cpus_and(domainspan, domainspan, *cpu_map);
@@ -372,25 +403,42 @@ void arch_destroy_sched_domains(const cpumask_t *cpu_map)
372	{	403	{
373	#ifdef CONFIG_NUMA	404	#ifdef CONFIG_NUMA
374	int i;	405	int i;
375	for (i = 0; i < MAX_NUMNODES; i++) {	406	int cpu;
376	cpumask_t nodemask = node_to_cpumask(i);
377	struct sched_group *oldsg, *sg = sched_group_nodes[i];
378		407
379	cpus_and(nodemask, nodemask, *cpu_map);	408	for_each_cpu_mask(cpu, *cpu_map) {
380	if (cpus_empty(nodemask))	409	struct sched_group *sched_group_allnodes
381	continue;	410	= sched_group_allnodes_bycpu[cpu];
		411	struct sched_group **sched_group_nodes
		412	= sched_group_nodes_bycpu[cpu];
382		413
383	if (sg == NULL)	414	if (sched_group_allnodes) {
		415	kfree(sched_group_allnodes);
		416	sched_group_allnodes_bycpu[cpu] = NULL;
		417	}
		418
		419	if (!sched_group_nodes)
384	continue;	420	continue;
385	sg = sg->next;	421
		422	for (i = 0; i < MAX_NUMNODES; i++) {
		423	cpumask_t nodemask = node_to_cpumask(i);
		424	struct sched_group *oldsg, *sg = sched_group_nodes[i];
		425
		426	cpus_and(nodemask, nodemask, *cpu_map);
		427	if (cpus_empty(nodemask))
		428	continue;
		429
		430	if (sg == NULL)
		431	continue;
		432	sg = sg->next;
386	next_sg:	433	next_sg:
387	oldsg = sg;	434	oldsg = sg;
388	sg = sg->next;	435	sg = sg->next;
389	kfree(oldsg);	436	kfree(oldsg);
390	if (oldsg != sched_group_nodes[i])	437	if (oldsg != sched_group_nodes[i])
391	goto next_sg;	438	goto next_sg;
392	sched_group_nodes[i] = NULL;	439	}
		440	kfree(sched_group_nodes);
		441	sched_group_nodes_bycpu[cpu] = NULL;
393	}	442	}
394	#endif	443	#endif
395	}	444	}
396