author     Dinakar Guniguntala <dino@in.ibm.com>          2005-06-25 17:57:33 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>      2005-06-25 19:24:45 -0400
commit     1a20ff27ef75d866730ee796acd811a925af762f (patch)
tree       a9e6acd72db03cfec5fdaee8cfab231032216581
parent     37e4ab3f0cba13adf3535d373fd98e5ee47b5410 (diff)
[PATCH] Dynamic sched domains: sched changes
The following patches add dynamic sched domains functionality that was
extensively discussed on lkml and lse-tech. I would like to see this added to -mm.

o The main advantage with this feature is that it ensures that the scheduler
  load balancing code only balances against the cpus that are in the sched
  domain as defined by an exclusive cpuset and not all of the cpus in the
  system. This removes any overhead due to load balancing code trying to
  pull tasks outside of the cpu exclusive cpuset only to be prevented by
  the tasks' cpus_allowed mask.
o cpu exclusive cpusets are useful for servers running orthogonal
  workloads such as RT applications requiring low latency and HPC
  applications that are throughput sensitive.
o It provides a new API, partition_sched_domains in sched.c, that makes
  dynamic sched domains possible.
o cpu_exclusive cpusets are now associated with a sched domain, which means
  that users can dynamically modify the sched domains through the cpuset
  file system interface.
o ia64 sched domain code has been updated to support this feature as well.
o Currently, this does not support hotplug. (However some of my tests
  indicate hotplug+preempt is currently broken.)
o I have tested it extensively on x86.
o This should have very minimal impact on performance as none of the
  fast paths are affected.

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--   include/linux/sched.h     2
-rw-r--r--   kernel/sched.c          132
2 files changed, 88 insertions, 46 deletions
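For context, the sketch below shows how a hypothetical kernel-side caller (for example, cpuset code reacting to a change in a cpu_exclusive cpuset) might use the new partition_sched_domains() API. The function example_repartition and the way the two partitions are derived are illustrative assumptions only and are not part of this patch; the real cpuset-side hook is expected to come from the companion cpuset changes in this series.

/*
 * Illustrative sketch only -- not part of this patch.  A caller splits
 * the online cpus into two disjoint masks and hands them to
 * partition_sched_domains() with the hotplug lock held.
 */
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>

static void example_repartition(const cpumask_t *exclusive_cpus)
{
	cpumask_t part1, part2;

	/* cpus owned by the exclusive cpuset ... */
	part1 = *exclusive_cpus;
	/* ... and every other online cpu. */
	cpus_andnot(part2, cpu_online_map, *exclusive_cpus);

	/* partition_sched_domains() must be called with the hotplug lock held. */
	lock_cpu_hotplug();
	partition_sched_domains(&part1, &part2);
	unlock_cpu_hotplug();
}

Note that partition_sched_domains() only detaches and rebuilds domains for cpus in the union of the two masks, so any online cpus outside both masks keep their existing sched domains.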
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edb2c69a8873..98c109e4f43d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -539,6 +539,8 @@ struct sched_domain {
 #endif
 };
 
+extern void partition_sched_domains(cpumask_t *partition1,
+					cpumask_t *partition2);
 #ifdef ARCH_HAS_SCHED_DOMAIN
 /* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
 extern cpumask_t cpu_isolated_map;
diff --git a/kernel/sched.c b/kernel/sched.c
index d3d81b82e378..dee96b22635e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See update_sched_domains: synchronize_kernel for details.
+ * See detach_destroy_domains: synchronize_sched for details.
  *
  * The domain tree of any CPU may only be accessed from within
  * preempt-disabled sections.
@@ -4624,7 +4624,7 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
@@ -4717,7 +4717,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
 
-static int __devinit sd_degenerate(struct sched_domain *sd)
+static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpus_weight(sd->span) == 1)
 		return 1;
@@ -4740,7 +4740,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd)
 	return 1;
 }
 
-static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+static int sd_parent_degenerate(struct sched_domain *sd,
 			struct sched_domain *parent)
 {
 	unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -4772,7 +4772,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4823,7 +4823,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
 			cpumask_t span, int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
@@ -4859,13 +4859,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
 
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
 #else
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
 {
 	return cpu;
 }
@@ -4873,7 +4874,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
 	return first_cpu(cpu_sibling_map[cpu]);
@@ -4886,7 +4887,7 @@ static int __devinit cpu_to_phys_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
 {
 	return cpu_to_node(cpu);
 }
@@ -4917,39 +4918,28 @@ static void check_sibling_maps(void)
 #endif
 
 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
  */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_complement(cpu_default_map, cpu_isolated_map);
-	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
 	/*
-	 * Set up domains. Isolated domains just stay on the NULL domain.
+	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int group;
 		struct sched_domain *sd = NULL, *p;
 		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(node_domains, i);
 		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = cpu_default_map;
+		sd->span = *cpu_map;
 		sd->groups = &sched_group_nodes[group];
 #endif
 
@@ -4967,7 +4957,7 @@ static void __devinit arch_init_sched_domains(void)
 		group = cpu_to_cpu_group(i);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, cpu_default_map);
+		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		sd->groups = &sched_group_cpus[group];
 #endif
@@ -4977,7 +4967,7 @@ static void __devinit arch_init_sched_domains(void)
 	/* Set up CPU (sibling) groups */
 	for_each_online_cpu(i) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
 			continue;
 
@@ -4990,7 +4980,7 @@ static void __devinit arch_init_sched_domains(void)
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 		if (cpus_empty(nodemask))
 			continue;
 
@@ -5000,12 +4990,12 @@ static void __devinit arch_init_sched_domains(void)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, cpu_default_map,
+	init_sched_build_groups(sched_group_nodes, *cpu_map,
 					&cpu_to_node_group);
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int power;
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -5029,7 +5019,7 @@ static void __devinit arch_init_sched_domains(void)
 	}
 
 	/* Attach the domains */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -5039,16 +5029,71 @@ static void __devinit arch_init_sched_domains(void)
 		cpu_attach_domain(sd, i);
 	}
 }
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+	cpumask_t cpu_default_map;
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+	check_sibling_maps();
+#endif
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+	build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 	/* Do nothing: everything is statically allocated. */
 }
-#endif
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+	int i;
+
+	for_each_cpu_mask(i, *cpu_map)
+		cpu_attach_domain(NULL, i);
+	synchronize_sched();
+	arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	cpumask_t change_map;
+
+	cpus_and(*partition1, *partition1, cpu_online_map);
+	cpus_and(*partition2, *partition2, cpu_online_map);
+	cpus_or(change_map, *partition1, *partition2);
+
+	/* Detach sched domains from all of the affected cpus */
+	detach_destroy_domains(&change_map);
+	if (!cpus_empty(*partition1))
+		build_sched_domains(partition1);
+	if (!cpus_empty(*partition2))
+		build_sched_domains(partition2);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy. The domains
@@ -5059,15 +5104,10 @@ static void __devinit arch_destroy_sched_domains(void)
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
-	int i;
-
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_PREPARE:
-		for_each_online_cpu(i)
-			cpu_attach_domain(NULL, i);
-		synchronize_kernel();
-		arch_destroy_sched_domains();
+		detach_destroy_domains(&cpu_online_map);
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -5083,7 +5123,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	}
 
 	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 
 	return NOTIFY_OK;
 }
@@ -5092,7 +5132,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 void __init sched_init_smp(void)
 {
 	lock_cpu_hotplug();
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);