diff options
author | Dinakar Guniguntala <dino@in.ibm.com> | 2005-06-25 17:57:33 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-25 19:24:45 -0400 |
commit | 1a20ff27ef75d866730ee796acd811a925af762f (patch) | |
tree | a9e6acd72db03cfec5fdaee8cfab231032216581 /kernel | |
parent | 37e4ab3f0cba13adf3535d373fd98e5ee47b5410 (diff) |
[PATCH] Dynamic sched domains: sched changes
The following patches add dynamic sched domains functionality that was
extensively discussed on lkml and lse-tech. I would like to see this added to
-mm
o The main advantage with this feature is that it ensures that the scheduler
load balancing code only balances against the cpus that are in the sched
domain as defined by an exclusive cpuset and not all of the cpus in the
system. This removes any overhead due to load balancing code trying to
pull tasks outside of the cpu exclusive cpuset only to be prevented by
the tasks' cpus_allowed mask.
o cpu exclusive cpusets are useful for servers running orthogonal
workloads such as RT applications requiring low latency and HPC
applications that are throughput sensitive
o It provides a new API partition_sched_domains in sched.c
that makes dynamic sched domains possible.
o cpu_exclusive cpusets are now associated with a sched domain,
which means that users can dynamically modify the sched domains
through the cpuset file system interface.
o ia64 sched domain code has been updated to support this feature as well
o Currently, this does not support hotplug. (However, some of my tests
indicate hotplug+preempt is currently broken.)
o I have tested it extensively on x86.
o This should have very minimal impact on performance as none of
the fast paths are affected
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/sched.c | 132 |
1 files changed, 86 insertions, 46 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index d3d81b82e37..dee96b22635 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); | |||
262 | 262 | ||
263 | /* | 263 | /* |
264 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 264 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
265 | * See update_sched_domains: synchronize_kernel for details. | 265 | * See detach_destroy_domains: synchronize_sched for details. |
266 | * | 266 | * |
267 | * The domain tree of any CPU may only be accessed from within | 267 | * The domain tree of any CPU may only be accessed from within |
268 | * preempt-disabled sections. | 268 | * preempt-disabled sections. |
@@ -4624,7 +4624,7 @@ int __init migration_init(void) | |||
4624 | #endif | 4624 | #endif |
4625 | 4625 | ||
4626 | #ifdef CONFIG_SMP | 4626 | #ifdef CONFIG_SMP |
4627 | #define SCHED_DOMAIN_DEBUG | 4627 | #undef SCHED_DOMAIN_DEBUG |
4628 | #ifdef SCHED_DOMAIN_DEBUG | 4628 | #ifdef SCHED_DOMAIN_DEBUG |
4629 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 4629 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
4630 | { | 4630 | { |
@@ -4717,7 +4717,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
4717 | #define sched_domain_debug(sd, cpu) {} | 4717 | #define sched_domain_debug(sd, cpu) {} |
4718 | #endif | 4718 | #endif |
4719 | 4719 | ||
4720 | static int __devinit sd_degenerate(struct sched_domain *sd) | 4720 | static int sd_degenerate(struct sched_domain *sd) |
4721 | { | 4721 | { |
4722 | if (cpus_weight(sd->span) == 1) | 4722 | if (cpus_weight(sd->span) == 1) |
4723 | return 1; | 4723 | return 1; |
@@ -4740,7 +4740,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd) | |||
4740 | return 1; | 4740 | return 1; |
4741 | } | 4741 | } |
4742 | 4742 | ||
4743 | static int __devinit sd_parent_degenerate(struct sched_domain *sd, | 4743 | static int sd_parent_degenerate(struct sched_domain *sd, |
4744 | struct sched_domain *parent) | 4744 | struct sched_domain *parent) |
4745 | { | 4745 | { |
4746 | unsigned long cflags = sd->flags, pflags = parent->flags; | 4746 | unsigned long cflags = sd->flags, pflags = parent->flags; |
@@ -4772,7 +4772,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd, | |||
4772 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4772 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4773 | * hold the hotplug lock. | 4773 | * hold the hotplug lock. |
4774 | */ | 4774 | */ |
4775 | void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) | 4775 | void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4776 | { | 4776 | { |
4777 | runqueue_t *rq = cpu_rq(cpu); | 4777 | runqueue_t *rq = cpu_rq(cpu); |
4778 | struct sched_domain *tmp; | 4778 | struct sched_domain *tmp; |
@@ -4823,7 +4823,7 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
4823 | * covered by the given span, and will set each group's ->cpumask correctly, | 4823 | * covered by the given span, and will set each group's ->cpumask correctly, |
4824 | * and ->cpu_power to 0. | 4824 | * and ->cpu_power to 0. |
4825 | */ | 4825 | */ |
4826 | void __devinit init_sched_build_groups(struct sched_group groups[], | 4826 | void init_sched_build_groups(struct sched_group groups[], |
4827 | cpumask_t span, int (*group_fn)(int cpu)) | 4827 | cpumask_t span, int (*group_fn)(int cpu)) |
4828 | { | 4828 | { |
4829 | struct sched_group *first = NULL, *last = NULL; | 4829 | struct sched_group *first = NULL, *last = NULL; |
@@ -4859,13 +4859,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], | |||
4859 | 4859 | ||
4860 | 4860 | ||
4861 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4861 | #ifdef ARCH_HAS_SCHED_DOMAIN |
4862 | extern void __devinit arch_init_sched_domains(void); | 4862 | extern void build_sched_domains(const cpumask_t *cpu_map); |
4863 | extern void __devinit arch_destroy_sched_domains(void); | 4863 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); |
4864 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | ||
4864 | #else | 4865 | #else |
4865 | #ifdef CONFIG_SCHED_SMT | 4866 | #ifdef CONFIG_SCHED_SMT |
4866 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 4867 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
4867 | static struct sched_group sched_group_cpus[NR_CPUS]; | 4868 | static struct sched_group sched_group_cpus[NR_CPUS]; |
4868 | static int __devinit cpu_to_cpu_group(int cpu) | 4869 | static int cpu_to_cpu_group(int cpu) |
4869 | { | 4870 | { |
4870 | return cpu; | 4871 | return cpu; |
4871 | } | 4872 | } |
@@ -4873,7 +4874,7 @@ static int __devinit cpu_to_cpu_group(int cpu) | |||
4873 | 4874 | ||
4874 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 4875 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
4875 | static struct sched_group sched_group_phys[NR_CPUS]; | 4876 | static struct sched_group sched_group_phys[NR_CPUS]; |
4876 | static int __devinit cpu_to_phys_group(int cpu) | 4877 | static int cpu_to_phys_group(int cpu) |
4877 | { | 4878 | { |
4878 | #ifdef CONFIG_SCHED_SMT | 4879 | #ifdef CONFIG_SCHED_SMT |
4879 | return first_cpu(cpu_sibling_map[cpu]); | 4880 | return first_cpu(cpu_sibling_map[cpu]); |
@@ -4886,7 +4887,7 @@ static int __devinit cpu_to_phys_group(int cpu) | |||
4886 | 4887 | ||
4887 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 4888 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
4888 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | 4889 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; |
4889 | static int __devinit cpu_to_node_group(int cpu) | 4890 | static int cpu_to_node_group(int cpu) |
4890 | { | 4891 | { |
4891 | return cpu_to_node(cpu); | 4892 | return cpu_to_node(cpu); |
4892 | } | 4893 | } |
@@ -4917,39 +4918,28 @@ static void check_sibling_maps(void) | |||
4917 | #endif | 4918 | #endif |
4918 | 4919 | ||
4919 | /* | 4920 | /* |
4920 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 4921 | * Build sched domains for a given set of cpus and attach the sched domains |
4922 | * to the individual cpus | ||
4921 | */ | 4923 | */ |
4922 | static void __devinit arch_init_sched_domains(void) | 4924 | static void build_sched_domains(const cpumask_t *cpu_map) |
4923 | { | 4925 | { |
4924 | int i; | 4926 | int i; |
4925 | cpumask_t cpu_default_map; | ||
4926 | |||
4927 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4928 | check_sibling_maps(); | ||
4929 | #endif | ||
4930 | /* | ||
4931 | * Setup mask for cpus without special case scheduling requirements. | ||
4932 | * For now this just excludes isolated cpus, but could be used to | ||
4933 | * exclude other special cases in the future. | ||
4934 | */ | ||
4935 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
4936 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
4937 | 4927 | ||
4938 | /* | 4928 | /* |
4939 | * Set up domains. Isolated domains just stay on the NULL domain. | 4929 | * Set up domains for cpus specified by the cpu_map. |
4940 | */ | 4930 | */ |
4941 | for_each_cpu_mask(i, cpu_default_map) { | 4931 | for_each_cpu_mask(i, *cpu_map) { |
4942 | int group; | 4932 | int group; |
4943 | struct sched_domain *sd = NULL, *p; | 4933 | struct sched_domain *sd = NULL, *p; |
4944 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 4934 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
4945 | 4935 | ||
4946 | cpus_and(nodemask, nodemask, cpu_default_map); | 4936 | cpus_and(nodemask, nodemask, *cpu_map); |
4947 | 4937 | ||
4948 | #ifdef CONFIG_NUMA | 4938 | #ifdef CONFIG_NUMA |
4949 | sd = &per_cpu(node_domains, i); | 4939 | sd = &per_cpu(node_domains, i); |
4950 | group = cpu_to_node_group(i); | 4940 | group = cpu_to_node_group(i); |
4951 | *sd = SD_NODE_INIT; | 4941 | *sd = SD_NODE_INIT; |
4952 | sd->span = cpu_default_map; | 4942 | sd->span = *cpu_map; |
4953 | sd->groups = &sched_group_nodes[group]; | 4943 | sd->groups = &sched_group_nodes[group]; |
4954 | #endif | 4944 | #endif |
4955 | 4945 | ||
@@ -4967,7 +4957,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4967 | group = cpu_to_cpu_group(i); | 4957 | group = cpu_to_cpu_group(i); |
4968 | *sd = SD_SIBLING_INIT; | 4958 | *sd = SD_SIBLING_INIT; |
4969 | sd->span = cpu_sibling_map[i]; | 4959 | sd->span = cpu_sibling_map[i]; |
4970 | cpus_and(sd->span, sd->span, cpu_default_map); | 4960 | cpus_and(sd->span, sd->span, *cpu_map); |
4971 | sd->parent = p; | 4961 | sd->parent = p; |
4972 | sd->groups = &sched_group_cpus[group]; | 4962 | sd->groups = &sched_group_cpus[group]; |
4973 | #endif | 4963 | #endif |
@@ -4977,7 +4967,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4977 | /* Set up CPU (sibling) groups */ | 4967 | /* Set up CPU (sibling) groups */ |
4978 | for_each_online_cpu(i) { | 4968 | for_each_online_cpu(i) { |
4979 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 4969 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
4980 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | 4970 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
4981 | if (i != first_cpu(this_sibling_map)) | 4971 | if (i != first_cpu(this_sibling_map)) |
4982 | continue; | 4972 | continue; |
4983 | 4973 | ||
@@ -4990,7 +4980,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4990 | for (i = 0; i < MAX_NUMNODES; i++) { | 4980 | for (i = 0; i < MAX_NUMNODES; i++) { |
4991 | cpumask_t nodemask = node_to_cpumask(i); | 4981 | cpumask_t nodemask = node_to_cpumask(i); |
4992 | 4982 | ||
4993 | cpus_and(nodemask, nodemask, cpu_default_map); | 4983 | cpus_and(nodemask, nodemask, *cpu_map); |
4994 | if (cpus_empty(nodemask)) | 4984 | if (cpus_empty(nodemask)) |
4995 | continue; | 4985 | continue; |
4996 | 4986 | ||
@@ -5000,12 +4990,12 @@ static void __devinit arch_init_sched_domains(void) | |||
5000 | 4990 | ||
5001 | #ifdef CONFIG_NUMA | 4991 | #ifdef CONFIG_NUMA |
5002 | /* Set up node groups */ | 4992 | /* Set up node groups */ |
5003 | init_sched_build_groups(sched_group_nodes, cpu_default_map, | 4993 | init_sched_build_groups(sched_group_nodes, *cpu_map, |
5004 | &cpu_to_node_group); | 4994 | &cpu_to_node_group); |
5005 | #endif | 4995 | #endif |
5006 | 4996 | ||
5007 | /* Calculate CPU power for physical packages and nodes */ | 4997 | /* Calculate CPU power for physical packages and nodes */ |
5008 | for_each_cpu_mask(i, cpu_default_map) { | 4998 | for_each_cpu_mask(i, *cpu_map) { |
5009 | int power; | 4999 | int power; |
5010 | struct sched_domain *sd; | 5000 | struct sched_domain *sd; |
5011 | #ifdef CONFIG_SCHED_SMT | 5001 | #ifdef CONFIG_SCHED_SMT |
@@ -5029,7 +5019,7 @@ static void __devinit arch_init_sched_domains(void) | |||
5029 | } | 5019 | } |
5030 | 5020 | ||
5031 | /* Attach the domains */ | 5021 | /* Attach the domains */ |
5032 | for_each_online_cpu(i) { | 5022 | for_each_cpu_mask(i, *cpu_map) { |
5033 | struct sched_domain *sd; | 5023 | struct sched_domain *sd; |
5034 | #ifdef CONFIG_SCHED_SMT | 5024 | #ifdef CONFIG_SCHED_SMT |
5035 | sd = &per_cpu(cpu_domains, i); | 5025 | sd = &per_cpu(cpu_domains, i); |
@@ -5039,16 +5029,71 @@ static void __devinit arch_init_sched_domains(void) | |||
5039 | cpu_attach_domain(sd, i); | 5029 | cpu_attach_domain(sd, i); |
5040 | } | 5030 | } |
5041 | } | 5031 | } |
5032 | /* | ||
5033 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
5034 | */ | ||
5035 | static void arch_init_sched_domains(cpumask_t *cpu_map) | ||
5036 | { | ||
5037 | cpumask_t cpu_default_map; | ||
5042 | 5038 | ||
5043 | #ifdef CONFIG_HOTPLUG_CPU | 5039 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) |
5044 | static void __devinit arch_destroy_sched_domains(void) | 5040 | check_sibling_maps(); |
5041 | #endif | ||
5042 | /* | ||
5043 | * Setup mask for cpus without special case scheduling requirements. | ||
5044 | * For now this just excludes isolated cpus, but could be used to | ||
5045 | * exclude other special cases in the future. | ||
5046 | */ | ||
5047 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | ||
5048 | |||
5049 | build_sched_domains(&cpu_default_map); | ||
5050 | } | ||
5051 | |||
5052 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | ||
5045 | { | 5053 | { |
5046 | /* Do nothing: everything is statically allocated. */ | 5054 | /* Do nothing: everything is statically allocated. */ |
5047 | } | 5055 | } |
5048 | #endif | ||
5049 | 5056 | ||
5050 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5057 | #endif /* ARCH_HAS_SCHED_DOMAIN */ |
5051 | 5058 | ||
5059 | /* | ||
5060 | * Detach sched domains from a group of cpus specified in cpu_map | ||
5061 | * These cpus will now be attached to the NULL domain | ||
5062 | */ | ||
5063 | static inline void detach_destroy_domains(const cpumask_t *cpu_map) | ||
5064 | { | ||
5065 | int i; | ||
5066 | |||
5067 | for_each_cpu_mask(i, *cpu_map) | ||
5068 | cpu_attach_domain(NULL, i); | ||
5069 | synchronize_sched(); | ||
5070 | arch_destroy_sched_domains(cpu_map); | ||
5071 | } | ||
5072 | |||
5073 | /* | ||
5074 | * Partition sched domains as specified by the cpumasks below. | ||
5075 | * This attaches all cpus from the cpumasks to the NULL domain, | ||
5076 | * waits for a RCU quiescent period, recalculates sched | ||
5077 | * domain information and then attaches them back to the | ||
5078 | * correct sched domains | ||
5079 | * Call with hotplug lock held | ||
5080 | */ | ||
5081 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | ||
5082 | { | ||
5083 | cpumask_t change_map; | ||
5084 | |||
5085 | cpus_and(*partition1, *partition1, cpu_online_map); | ||
5086 | cpus_and(*partition2, *partition2, cpu_online_map); | ||
5087 | cpus_or(change_map, *partition1, *partition2); | ||
5088 | |||
5089 | /* Detach sched domains from all of the affected cpus */ | ||
5090 | detach_destroy_domains(&change_map); | ||
5091 | if (!cpus_empty(*partition1)) | ||
5092 | build_sched_domains(partition1); | ||
5093 | if (!cpus_empty(*partition2)) | ||
5094 | build_sched_domains(partition2); | ||
5095 | } | ||
5096 | |||
5052 | #ifdef CONFIG_HOTPLUG_CPU | 5097 | #ifdef CONFIG_HOTPLUG_CPU |
5053 | /* | 5098 | /* |
5054 | * Force a reinitialization of the sched domains hierarchy. The domains | 5099 | * Force a reinitialization of the sched domains hierarchy. The domains |
@@ -5059,15 +5104,10 @@ static void __devinit arch_destroy_sched_domains(void) | |||
5059 | static int update_sched_domains(struct notifier_block *nfb, | 5104 | static int update_sched_domains(struct notifier_block *nfb, |
5060 | unsigned long action, void *hcpu) | 5105 | unsigned long action, void *hcpu) |
5061 | { | 5106 | { |
5062 | int i; | ||
5063 | |||
5064 | switch (action) { | 5107 | switch (action) { |
5065 | case CPU_UP_PREPARE: | 5108 | case CPU_UP_PREPARE: |
5066 | case CPU_DOWN_PREPARE: | 5109 | case CPU_DOWN_PREPARE: |
5067 | for_each_online_cpu(i) | 5110 | detach_destroy_domains(&cpu_online_map); |
5068 | cpu_attach_domain(NULL, i); | ||
5069 | synchronize_kernel(); | ||
5070 | arch_destroy_sched_domains(); | ||
5071 | return NOTIFY_OK; | 5111 | return NOTIFY_OK; |
5072 | 5112 | ||
5073 | case CPU_UP_CANCELED: | 5113 | case CPU_UP_CANCELED: |
@@ -5083,7 +5123,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
5083 | } | 5123 | } |
5084 | 5124 | ||
5085 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 5125 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
5086 | arch_init_sched_domains(); | 5126 | arch_init_sched_domains(&cpu_online_map); |
5087 | 5127 | ||
5088 | return NOTIFY_OK; | 5128 | return NOTIFY_OK; |
5089 | } | 5129 | } |
@@ -5092,7 +5132,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
5092 | void __init sched_init_smp(void) | 5132 | void __init sched_init_smp(void) |
5093 | { | 5133 | { |
5094 | lock_cpu_hotplug(); | 5134 | lock_cpu_hotplug(); |
5095 | arch_init_sched_domains(); | 5135 | arch_init_sched_domains(&cpu_online_map); |
5096 | unlock_cpu_hotplug(); | 5136 | unlock_cpu_hotplug(); |
5097 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 5137 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
5098 | hotcpu_notifier(update_sched_domains, 0); | 5138 | hotcpu_notifier(update_sched_domains, 0); |