diff options
author | Paul Jackson <pj@sgi.com> | 2007-10-16 04:27:43 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:43:09 -0400 |
commit | 607717a65d92858fd925bec05baae4d142719f27 (patch) | |
tree | b7faea733fe3426881e63bc7549db9c97c8bdf59 /kernel | |
parent | 2ed6dc34f9ed39bb8e4c81ea1056f0ba56315841 (diff) |
cpuset: remove sched domain hooks from cpusets
Remove the cpuset hooks that defined sched domains depending on the setting
of the 'cpu_exclusive' flag.
The cpu_exclusive flag can only be set on a child if it is set on the
parent.
This made that flag painfully unsuitable for use as a flag defining a
partitioning of a system.
It was entirely unobvious to a cpuset user what partitioning of sched
domains they would be causing when they set that one cpu_exclusive bit on
one cpuset, because it depended on what CPUs were in the remainder of that
cpuset's siblings and child cpusets, after subtracting out other
cpu_exclusive cpusets.
Furthermore, there was no way on production systems to query the
result.
Using the cpu_exclusive flag for this was simply wrong from the get go.
Fortunately, it was sufficiently borked that so far as I know, almost no
successful use has been made of this. One real time group did use it to
effectively isolate CPUs from any load balancing efforts. They are willing
to adapt to alternative mechanisms for this, such as some way to manipulate
the list of isolated CPUs on a running system. They can do without this
present cpu_exclusive based mechanism while we develop an alternative.
There is a real risk, to the best of my understanding, of users
accidentally setting up partitioned scheduler domains, inhibiting desired
load balancing across all their CPUs, due to the nonobvious (from the
cpuset perspective) side effects of the cpu_exclusive flag.
Furthermore, since there was no way on a running system to see what one was
doing with sched domains, this change will be invisible to any code using it.
Unless they have real insight to the scheduler load balancing choices, they
will be unable to detect that this change has been made in the kernel's
behaviour.
Initial discussion on lkml of this patch has generated much comment. My
(probably controversial) take on that discussion is that it has reached a
rough consensus that the current cpuset cpu_exclusive mechanism for
defining sched domains is borked. There is no consensus on the
replacement. But since we can remove this mechanism, and since its
continued presence risks causing unwanted partitioning of the scheduler's
load balancing, we should remove it while we can, as we proceed to work on the
replacement scheduler domain mechanisms.
Signed-off-by: Paul Jackson <pj@sgi.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 84 | ||||
-rw-r--r-- | kernel/sched.c | 29 |
2 files changed, 2 insertions, 111 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e196510aa40f..0864f4097930 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -755,68 +755,13 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
755 | } | 755 | } |
756 | 756 | ||
757 | /* | 757 | /* |
758 | * For a given cpuset cur, partition the system as follows | ||
759 | * a. All cpus in the parent cpuset's cpus_allowed that are not part of any | ||
760 | * exclusive child cpusets | ||
761 | * b. All cpus in the current cpuset's cpus_allowed that are not part of any | ||
762 | * exclusive child cpusets | ||
763 | * Build these two partitions by calling partition_sched_domains | ||
764 | * | ||
765 | * Call with manage_mutex held. May nest a call to the | ||
766 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | ||
767 | * Must not be called holding callback_mutex, because we must | ||
768 | * not call lock_cpu_hotplug() while holding callback_mutex. | ||
769 | */ | ||
770 | |||
771 | static void update_cpu_domains(struct cpuset *cur) | ||
772 | { | ||
773 | struct cpuset *c, *par = cur->parent; | ||
774 | cpumask_t pspan, cspan; | ||
775 | |||
776 | if (par == NULL || cpus_empty(cur->cpus_allowed)) | ||
777 | return; | ||
778 | |||
779 | /* | ||
780 | * Get all cpus from parent's cpus_allowed not part of exclusive | ||
781 | * children | ||
782 | */ | ||
783 | pspan = par->cpus_allowed; | ||
784 | list_for_each_entry(c, &par->children, sibling) { | ||
785 | if (is_cpu_exclusive(c)) | ||
786 | cpus_andnot(pspan, pspan, c->cpus_allowed); | ||
787 | } | ||
788 | if (!is_cpu_exclusive(cur)) { | ||
789 | cpus_or(pspan, pspan, cur->cpus_allowed); | ||
790 | if (cpus_equal(pspan, cur->cpus_allowed)) | ||
791 | return; | ||
792 | cspan = CPU_MASK_NONE; | ||
793 | } else { | ||
794 | if (cpus_empty(pspan)) | ||
795 | return; | ||
796 | cspan = cur->cpus_allowed; | ||
797 | /* | ||
798 | * Get all cpus from current cpuset's cpus_allowed not part | ||
799 | * of exclusive children | ||
800 | */ | ||
801 | list_for_each_entry(c, &cur->children, sibling) { | ||
802 | if (is_cpu_exclusive(c)) | ||
803 | cpus_andnot(cspan, cspan, c->cpus_allowed); | ||
804 | } | ||
805 | } | ||
806 | |||
807 | lock_cpu_hotplug(); | ||
808 | partition_sched_domains(&pspan, &cspan); | ||
809 | unlock_cpu_hotplug(); | ||
810 | } | ||
811 | |||
812 | /* | ||
813 | * Call with manage_mutex held. May take callback_mutex during call. | 758 | * Call with manage_mutex held. May take callback_mutex during call. |
814 | */ | 759 | */ |
815 | 760 | ||
816 | static int update_cpumask(struct cpuset *cs, char *buf) | 761 | static int update_cpumask(struct cpuset *cs, char *buf) |
817 | { | 762 | { |
818 | struct cpuset trialcs; | 763 | struct cpuset trialcs; |
819 | int retval, cpus_unchanged; | 764 | int retval; |
820 | 765 | ||
821 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 766 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
822 | if (cs == &top_cpuset) | 767 | if (cs == &top_cpuset) |
@@ -843,12 +788,9 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
843 | retval = validate_change(cs, &trialcs); | 788 | retval = validate_change(cs, &trialcs); |
844 | if (retval < 0) | 789 | if (retval < 0) |
845 | return retval; | 790 | return retval; |
846 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | ||
847 | mutex_lock(&callback_mutex); | 791 | mutex_lock(&callback_mutex); |
848 | cs->cpus_allowed = trialcs.cpus_allowed; | 792 | cs->cpus_allowed = trialcs.cpus_allowed; |
849 | mutex_unlock(&callback_mutex); | 793 | mutex_unlock(&callback_mutex); |
850 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | ||
851 | update_cpu_domains(cs); | ||
852 | return 0; | 794 | return 0; |
853 | } | 795 | } |
854 | 796 | ||
@@ -1085,7 +1027,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
1085 | { | 1027 | { |
1086 | int turning_on; | 1028 | int turning_on; |
1087 | struct cpuset trialcs; | 1029 | struct cpuset trialcs; |
1088 | int err, cpu_exclusive_changed; | 1030 | int err; |
1089 | 1031 | ||
1090 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | 1032 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); |
1091 | 1033 | ||
@@ -1098,14 +1040,10 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
1098 | err = validate_change(cs, &trialcs); | 1040 | err = validate_change(cs, &trialcs); |
1099 | if (err < 0) | 1041 | if (err < 0) |
1100 | return err; | 1042 | return err; |
1101 | cpu_exclusive_changed = | ||
1102 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | ||
1103 | mutex_lock(&callback_mutex); | 1043 | mutex_lock(&callback_mutex); |
1104 | cs->flags = trialcs.flags; | 1044 | cs->flags = trialcs.flags; |
1105 | mutex_unlock(&callback_mutex); | 1045 | mutex_unlock(&callback_mutex); |
1106 | 1046 | ||
1107 | if (cpu_exclusive_changed) | ||
1108 | update_cpu_domains(cs); | ||
1109 | return 0; | 1047 | return 0; |
1110 | } | 1048 | } |
1111 | 1049 | ||
@@ -1965,17 +1903,6 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) | |||
1965 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); | 1903 | return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); |
1966 | } | 1904 | } |
1967 | 1905 | ||
1968 | /* | ||
1969 | * Locking note on the strange update_flag() call below: | ||
1970 | * | ||
1971 | * If the cpuset being removed is marked cpu_exclusive, then simulate | ||
1972 | * turning cpu_exclusive off, which will call update_cpu_domains(). | ||
1973 | * The lock_cpu_hotplug() call in update_cpu_domains() must not be | ||
1974 | * made while holding callback_mutex. Elsewhere the kernel nests | ||
1975 | * callback_mutex inside lock_cpu_hotplug() calls. So the reverse | ||
1976 | * nesting would risk an ABBA deadlock. | ||
1977 | */ | ||
1978 | |||
1979 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | 1906 | static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) |
1980 | { | 1907 | { |
1981 | struct cpuset *cs = dentry->d_fsdata; | 1908 | struct cpuset *cs = dentry->d_fsdata; |
@@ -1995,13 +1922,6 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1995 | mutex_unlock(&manage_mutex); | 1922 | mutex_unlock(&manage_mutex); |
1996 | return -EBUSY; | 1923 | return -EBUSY; |
1997 | } | 1924 | } |
1998 | if (is_cpu_exclusive(cs)) { | ||
1999 | int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); | ||
2000 | if (retval < 0) { | ||
2001 | mutex_unlock(&manage_mutex); | ||
2002 | return retval; | ||
2003 | } | ||
2004 | } | ||
2005 | parent = cs->parent; | 1925 | parent = cs->parent; |
2006 | mutex_lock(&callback_mutex); | 1926 | mutex_lock(&callback_mutex); |
2007 | set_bit(CS_REMOVED, &cs->flags); | 1927 | set_bit(CS_REMOVED, &cs->flags); |
diff --git a/kernel/sched.c b/kernel/sched.c index 78c8fbd373a3..0da2b2635c54 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -6348,35 +6348,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6348 | arch_destroy_sched_domains(cpu_map); | 6348 | arch_destroy_sched_domains(cpu_map); |
6349 | } | 6349 | } |
6350 | 6350 | ||
6351 | /* | ||
6352 | * Partition sched domains as specified by the cpumasks below. | ||
6353 | * This attaches all cpus from the cpumasks to the NULL domain, | ||
6354 | * waits for a RCU quiescent period, recalculates sched | ||
6355 | * domain information and then attaches them back to the | ||
6356 | * correct sched domains | ||
6357 | * Call with hotplug lock held | ||
6358 | */ | ||
6359 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | ||
6360 | { | ||
6361 | cpumask_t change_map; | ||
6362 | int err = 0; | ||
6363 | |||
6364 | cpus_and(*partition1, *partition1, cpu_online_map); | ||
6365 | cpus_and(*partition2, *partition2, cpu_online_map); | ||
6366 | cpus_or(change_map, *partition1, *partition2); | ||
6367 | |||
6368 | /* Detach sched domains from all of the affected cpus */ | ||
6369 | detach_destroy_domains(&change_map); | ||
6370 | if (!cpus_empty(*partition1)) | ||
6371 | err = build_sched_domains(partition1); | ||
6372 | if (!err && !cpus_empty(*partition2)) | ||
6373 | err = build_sched_domains(partition2); | ||
6374 | |||
6375 | register_sched_domain_sysctl(); | ||
6376 | |||
6377 | return err; | ||
6378 | } | ||
6379 | |||
6380 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6351 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6381 | static int arch_reinit_sched_domains(void) | 6352 | static int arch_reinit_sched_domains(void) |
6382 | { | 6353 | { |