cpuset sched_load_balance flag

Add a new per-cpuset flag called 'sched_load_balance'. When enabled in a cpuset (the default value) it tells the kernel scheduler that the scheduler should provide the normal load balancing on the CPUs in that cpuset, sometimes moving tasks from one CPU to a second CPU if the second CPU is less loaded and if that task is allowed to run there. When disabled (write "0" to the file) then it tells the kernel scheduler that load balancing is not required for the CPUs in that cpuset. Now even if this flag is disabled for some cpuset, the kernel may still have to load balance some or all the CPUs in that cpuset, if some overlapping cpuset has its sched_load_balance flag enabled. If there are some CPUs that are not in any cpuset whose sched_load_balance flag is enabled, the kernel scheduler will not load balance tasks to those CPUs. Moreover the kernel will partition the 'sched domains' (non-overlapping sets of CPUs over which load balancing is attempted) into the finest granularity partition that it can find, while still keeping any two CPUs that are in the same sched_load_balance enabled cpuset in the same element of the partition. This serves two purposes: 1) It provides a mechanism for real time isolation of some CPUs, and 2) it can be used to improve performance on systems with many CPUs by supporting configurations in which load balancing is not done across all CPUs at once, but rather only done in several smaller disjoint sets of CPUs. This mechanism replaces the earlier overloading of the per-cpuset flag 'cpu_exclusive', which overloading was removed in an earlier patch: cpuset-remove-sched-domain-hooks-from-cpusets. See further the Documentation and comments in the code itself. [akpm@linux-foundation.org: don't be weird] Signed-off-by: Paul Jackson <pj@sgi.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Jackson <pj@sgi.com> 2007-10-19 02:40:20 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:41 -0400
commit: 029190c515f15f512ac85de8fc686d4dbd0ae731 (patch)
tree: a946f9223d17e945141fef81f94a75b38e2cc6ef /kernel/sched.c
parent: 2f2a3a46fcafa7a12d61454f67f932dfe7d84c60 (diff)
1 files changed, 81 insertions, 14 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 5d5e107ebc4e..39d6354af489 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6376,26 +6376,31 @@ error:
        return -ENOMEM;
 #endif
 }
+static cpumask_t *doms_cur;     /* current sched domains */
+static int ndoms_cur;           /* number of sched domains in 'doms_cur' */
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask_t) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask_t fallback_doms.
+ */
+static cpumask_t fallback_doms;
 /*
 * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
 */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
-        cpumask_t cpu_default_map;
+        ndoms_cur = 1;
-        int err;
+        doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+        if (!doms_cur)
-        /*
+                doms_cur = &fallback_doms;
-         * Setup mask for cpus without special case scheduling requirements.
+        cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
-         * For now this just excludes isolated cpus, but could be used to
-         * exclude other special cases in the future.
-         */
-        cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-        err = build_sched_domains(&cpu_default_map);
        register_sched_domain_sysctl();
+        return build_sched_domains(doms_cur);
-        return err;
 }
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6419,6 +6424,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        arch_destroy_sched_domains(cpu_map);
 }
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks.  This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap.)  We should setup one
+ * sched domain for each mask.  CPUs not in any of the cpumasks will
+ * not be load balanced.  If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be kmalloc'd.  This routine takes
+ * ownership of it and will kfree it when done with it.  If the caller
+ * failed the kmalloc call, then it can pass in doms_new == NULL,
+ * and partition_sched_domains() will fallback to the single partition
+ * 'fallback_doms'.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
+{
+        int i, j;
+        if (doms_new == NULL) {
+                ndoms_new = 1;
+                doms_new = &fallback_doms;
+                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+        }
+        /* Destroy deleted domains */
+        for (i = 0; i < ndoms_cur; i++) {
+                for (j = 0; j < ndoms_new; j++) {
+                        if (cpus_equal(doms_cur[i], doms_new[j]))
+                                goto match1;
+                }
+                /* no match - a current sched domain not in new doms_new[] */
+                detach_destroy_domains(doms_cur + i);
+match1:
+                ;
+        }
+        /* Build new domains */
+        for (i = 0; i < ndoms_new; i++) {
+                for (j = 0; j < ndoms_cur; j++) {
+                        if (cpus_equal(doms_new[i], doms_cur[j]))
+                                goto match2;
+                }
+                /* no match - add a new doms_new */
+                build_sched_domains(doms_new + i);
+match2:
+                ;
+        }
+        /* Remember the new sched domains */
+        if (doms_cur != &fallback_doms)
+                kfree(doms_cur);
+        doms_cur = doms_new;
+        ndoms_cur = ndoms_new;
+}
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 static int arch_reinit_sched_domains(void)
 {
author	Paul Jackson <pj@sgi.com>	2007-10-19 02:40:20 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:41 -0400
commit	029190c515f15f512ac85de8fc686d4dbd0ae731 (patch)
tree	a946f9223d17e945141fef81f94a75b38e2cc6ef /kernel/sched.c
parent	2f2a3a46fcafa7a12d61454f67f932dfe7d84c60 (diff)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5d5e107ebc4e..39d6354af489 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6376,26 +6376,31 @@ error:
6376	return -ENOMEM;	6376	return -ENOMEM;
6377	#endif	6377	#endif
6378	}	6378	}
		6379
		6380	static cpumask_t *doms_cur; /* current sched domains */
		6381	static int ndoms_cur; /* number of sched domains in 'doms_cur' */
		6382
		6383	/*
		6384	* Special case: If a kmalloc of a doms_cur partition (array of
		6385	* cpumask_t) fails, then fallback to a single sched domain,
		6386	* as determined by the single cpumask_t fallback_doms.
		6387	*/
		6388	static cpumask_t fallback_doms;
		6389
6379	/*	6390	/*
6380	* Set up scheduler domains and groups. Callers must hold the hotplug lock.	6391	* Set up scheduler domains and groups. Callers must hold the hotplug lock.
		6392	* For now this just excludes isolated cpus, but could be used to
		6393	* exclude other special cases in the future.
6381	*/	6394	*/
6382	static int arch_init_sched_domains(const cpumask_t *cpu_map)	6395	static int arch_init_sched_domains(const cpumask_t *cpu_map)
6383	{	6396	{
6384	cpumask_t cpu_default_map;	6397	ndoms_cur = 1;
6385	int err;	6398	doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
6386		6399	if (!doms_cur)
6387	/*	6400	doms_cur = &fallback_doms;
6388	* Setup mask for cpus without special case scheduling requirements.	6401	cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
6389	* For now this just excludes isolated cpus, but could be used to
6390	* exclude other special cases in the future.
6391	*/
6392	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6393
6394	err = build_sched_domains(&cpu_default_map);
6395
6396	register_sched_domain_sysctl();	6402	register_sched_domain_sysctl();
6397		6403	return build_sched_domains(doms_cur);
6398	return err;
6399	}	6404	}
6400		6405
6401	static void arch_destroy_sched_domains(const cpumask_t *cpu_map)	6406	static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6419,6 +6424,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6419	arch_destroy_sched_domains(cpu_map);	6424	arch_destroy_sched_domains(cpu_map);
6420	}	6425	}
6421		6426
		6427	/*
		6428	* Partition sched domains as specified by the 'ndoms_new'
		6429	* cpumasks in the array doms_new[] of cpumasks. This compares
		6430	* doms_new[] to the current sched domain partitioning, doms_cur[].
		6431	* It destroys each deleted domain and builds each new domain.
		6432	*
		6433	* 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
		6434	* The masks don't intersect (don't overlap.) We should setup one
		6435	* sched domain for each mask. CPUs not in any of the cpumasks will
		6436	* not be load balanced. If the same cpumask appears both in the
		6437	* current 'doms_cur' domains and in the new 'doms_new', we can leave
		6438	* it as it is.
		6439	*
		6440	* The passed in 'doms_new' should be kmalloc'd. This routine takes
		6441	* ownership of it and will kfree it when done with it. If the caller
		6442	* failed the kmalloc call, then it can pass in doms_new == NULL,
		6443	* and partition_sched_domains() will fallback to the single partition
		6444	* 'fallback_doms'.
		6445	*
		6446	* Call with hotplug lock held
		6447	*/
		6448	void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
		6449	{
		6450	int i, j;
		6451
		6452	if (doms_new == NULL) {
		6453	ndoms_new = 1;
		6454	doms_new = &fallback_doms;
		6455	cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
		6456	}
		6457
		6458	/* Destroy deleted domains */
		6459	for (i = 0; i < ndoms_cur; i++) {
		6460	for (j = 0; j < ndoms_new; j++) {
		6461	if (cpus_equal(doms_cur[i], doms_new[j]))
		6462	goto match1;
		6463	}
		6464	/* no match - a current sched domain not in new doms_new[] */
		6465	detach_destroy_domains(doms_cur + i);
		6466	match1:
		6467	;
		6468	}
		6469
		6470	/* Build new domains */
		6471	for (i = 0; i < ndoms_new; i++) {
		6472	for (j = 0; j < ndoms_cur; j++) {
		6473	if (cpus_equal(doms_new[i], doms_cur[j]))
		6474	goto match2;
		6475	}
		6476	/* no match - add a new doms_new */
		6477	build_sched_domains(doms_new + i);
		6478	match2:
		6479	;
		6480	}
		6481
		6482	/* Remember the new sched domains */
		6483	if (doms_cur != &fallback_doms)
		6484	kfree(doms_cur);
		6485	doms_cur = doms_new;
		6486	ndoms_cur = ndoms_new;
		6487	}
		6488
6422	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)	6489	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6423	static int arch_reinit_sched_domains(void)	6490	static int arch_reinit_sched_domains(void)
6424	{	6491	{