aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPaul Jackson <pj@sgi.com>2007-10-16 04:27:43 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-10-16 12:43:09 -0400
commit607717a65d92858fd925bec05baae4d142719f27 (patch)
treeb7faea733fe3426881e63bc7549db9c97c8bdf59 /kernel
parent2ed6dc34f9ed39bb8e4c81ea1056f0ba56315841 (diff)
cpuset: remove sched domain hooks from cpusets
Remove the cpuset hooks that defined sched domains depending on the setting of the 'cpu_exclusive' flag. The cpu_exclusive flag can only be set on a child if it is set on the parent. This made that flag painfully unsuitable for use as a flag defining a partitioning of a system. It was entirely unobvious to a cpuset user what partitioning of sched domains they would be causing when they set that one cpu_exclusive bit on one cpuset, because it depended on what CPUs were in the remainder of that cpuset's siblings and child cpusets, after subtracting out other cpu_exclusive cpusets. Furthermore, there was no way on production systems to query the result. Using the cpu_exclusive flag for this was simply wrong from the get go. Fortunately, it was sufficiently borked that so far as I know, almost no successful use has been made of this. One real time group did use it to effectively isolate CPUs from any load balancing efforts. They are willing to adapt to alternative mechanisms for this, such as some way to manipulate the list of isolated CPUs on a running system. They can do without this present cpu_exclusive based mechanism while we develop an alternative. There is a real risk, to the best of my understanding, of users accidentally setting up partitioned scheduler domains, inhibiting desired load balancing across all their CPUs, due to the nonobvious (from the cpuset perspective) side effects of the cpu_exclusive flag. Furthermore, since there was no way on a running system to see what one was doing with sched domains, this change will be invisible to any using code. Unless they have real insight into the scheduler's load balancing choices, they will be unable to detect that this change has been made in the kernel's behaviour. Initial discussion on lkml of this patch has generated much comment. My (probably controversial) take on that discussion is that it has reached a rough consensus that the current cpuset cpu_exclusive mechanism for defining sched domains is borked. 
There is no consensus on the replacement. But since we can remove this mechanism, and since its continued presence risks causing unwanted partitioning of the scheduler's load balancing, we should remove it while we can, as we proceed to work on the replacement scheduler domain mechanisms. Signed-off-by: Paul Jackson <pj@sgi.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Christoph Lameter <clameter@engr.sgi.com> Cc: Dinakar Guniguntala <dino@in.ibm.com> Cc: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpuset.c84
-rw-r--r--kernel/sched.c29
2 files changed, 2 insertions, 111 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e196510aa40f..0864f4097930 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -755,68 +755,13 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
755} 755}
756 756
757/* 757/*
758 * For a given cpuset cur, partition the system as follows
759 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
760 * exclusive child cpusets
761 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
762 * exclusive child cpusets
763 * Build these two partitions by calling partition_sched_domains
764 *
765 * Call with manage_mutex held. May nest a call to the
766 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
767 * Must not be called holding callback_mutex, because we must
768 * not call lock_cpu_hotplug() while holding callback_mutex.
769 */
770
771static void update_cpu_domains(struct cpuset *cur)
772{
773 struct cpuset *c, *par = cur->parent;
774 cpumask_t pspan, cspan;
775
776 if (par == NULL || cpus_empty(cur->cpus_allowed))
777 return;
778
779 /*
780 * Get all cpus from parent's cpus_allowed not part of exclusive
781 * children
782 */
783 pspan = par->cpus_allowed;
784 list_for_each_entry(c, &par->children, sibling) {
785 if (is_cpu_exclusive(c))
786 cpus_andnot(pspan, pspan, c->cpus_allowed);
787 }
788 if (!is_cpu_exclusive(cur)) {
789 cpus_or(pspan, pspan, cur->cpus_allowed);
790 if (cpus_equal(pspan, cur->cpus_allowed))
791 return;
792 cspan = CPU_MASK_NONE;
793 } else {
794 if (cpus_empty(pspan))
795 return;
796 cspan = cur->cpus_allowed;
797 /*
798 * Get all cpus from current cpuset's cpus_allowed not part
799 * of exclusive children
800 */
801 list_for_each_entry(c, &cur->children, sibling) {
802 if (is_cpu_exclusive(c))
803 cpus_andnot(cspan, cspan, c->cpus_allowed);
804 }
805 }
806
807 lock_cpu_hotplug();
808 partition_sched_domains(&pspan, &cspan);
809 unlock_cpu_hotplug();
810}
811
812/*
813 * Call with manage_mutex held. May take callback_mutex during call. 758 * Call with manage_mutex held. May take callback_mutex during call.
814 */ 759 */
815 760
816static int update_cpumask(struct cpuset *cs, char *buf) 761static int update_cpumask(struct cpuset *cs, char *buf)
817{ 762{
818 struct cpuset trialcs; 763 struct cpuset trialcs;
819 int retval, cpus_unchanged; 764 int retval;
820 765
821 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 766 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
822 if (cs == &top_cpuset) 767 if (cs == &top_cpuset)
@@ -843,12 +788,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
843 retval = validate_change(cs, &trialcs); 788 retval = validate_change(cs, &trialcs);
844 if (retval < 0) 789 if (retval < 0)
845 return retval; 790 return retval;
846 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
847 mutex_lock(&callback_mutex); 791 mutex_lock(&callback_mutex);
848 cs->cpus_allowed = trialcs.cpus_allowed; 792 cs->cpus_allowed = trialcs.cpus_allowed;
849 mutex_unlock(&callback_mutex); 793 mutex_unlock(&callback_mutex);
850 if (is_cpu_exclusive(cs) && !cpus_unchanged)
851 update_cpu_domains(cs);
852 return 0; 794 return 0;
853} 795}
854 796
@@ -1085,7 +1027,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1085{ 1027{
1086 int turning_on; 1028 int turning_on;
1087 struct cpuset trialcs; 1029 struct cpuset trialcs;
1088 int err, cpu_exclusive_changed; 1030 int err;
1089 1031
1090 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 1032 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
1091 1033
@@ -1098,14 +1040,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1098 err = validate_change(cs, &trialcs); 1040 err = validate_change(cs, &trialcs);
1099 if (err < 0) 1041 if (err < 0)
1100 return err; 1042 return err;
1101 cpu_exclusive_changed =
1102 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
1103 mutex_lock(&callback_mutex); 1043 mutex_lock(&callback_mutex);
1104 cs->flags = trialcs.flags; 1044 cs->flags = trialcs.flags;
1105 mutex_unlock(&callback_mutex); 1045 mutex_unlock(&callback_mutex);
1106 1046
1107 if (cpu_exclusive_changed)
1108 update_cpu_domains(cs);
1109 return 0; 1047 return 0;
1110} 1048}
1111 1049
@@ -1965,17 +1903,6 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1965 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1903 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1966} 1904}
1967 1905
1968/*
1969 * Locking note on the strange update_flag() call below:
1970 *
1971 * If the cpuset being removed is marked cpu_exclusive, then simulate
1972 * turning cpu_exclusive off, which will call update_cpu_domains().
1973 * The lock_cpu_hotplug() call in update_cpu_domains() must not be
1974 * made while holding callback_mutex. Elsewhere the kernel nests
1975 * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
1976 * nesting would risk an ABBA deadlock.
1977 */
1978
1979static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1906static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1980{ 1907{
1981 struct cpuset *cs = dentry->d_fsdata; 1908 struct cpuset *cs = dentry->d_fsdata;
@@ -1995,13 +1922,6 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1995 mutex_unlock(&manage_mutex); 1922 mutex_unlock(&manage_mutex);
1996 return -EBUSY; 1923 return -EBUSY;
1997 } 1924 }
1998 if (is_cpu_exclusive(cs)) {
1999 int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
2000 if (retval < 0) {
2001 mutex_unlock(&manage_mutex);
2002 return retval;
2003 }
2004 }
2005 parent = cs->parent; 1925 parent = cs->parent;
2006 mutex_lock(&callback_mutex); 1926 mutex_lock(&callback_mutex);
2007 set_bit(CS_REMOVED, &cs->flags); 1927 set_bit(CS_REMOVED, &cs->flags);
diff --git a/kernel/sched.c b/kernel/sched.c
index 78c8fbd373a3..0da2b2635c54 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6348,35 +6348,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6348 arch_destroy_sched_domains(cpu_map); 6348 arch_destroy_sched_domains(cpu_map);
6349} 6349}
6350 6350
6351/*
6352 * Partition sched domains as specified by the cpumasks below.
6353 * This attaches all cpus from the cpumasks to the NULL domain,
6354 * waits for a RCU quiescent period, recalculates sched
6355 * domain information and then attaches them back to the
6356 * correct sched domains
6357 * Call with hotplug lock held
6358 */
6359int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6360{
6361 cpumask_t change_map;
6362 int err = 0;
6363
6364 cpus_and(*partition1, *partition1, cpu_online_map);
6365 cpus_and(*partition2, *partition2, cpu_online_map);
6366 cpus_or(change_map, *partition1, *partition2);
6367
6368 /* Detach sched domains from all of the affected cpus */
6369 detach_destroy_domains(&change_map);
6370 if (!cpus_empty(*partition1))
6371 err = build_sched_domains(partition1);
6372 if (!err && !cpus_empty(*partition2))
6373 err = build_sched_domains(partition2);
6374
6375 register_sched_domain_sysctl();
6376
6377 return err;
6378}
6379
6380#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6351#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6381static int arch_reinit_sched_domains(void) 6352static int arch_reinit_sched_domains(void)
6382{ 6353{