path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  339
1 file changed, 285 insertions(+), 54 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..9508527845df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4779,7 +4779,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4802,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4830,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+				    int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4864,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
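
The hunk above adds a greedy nearest-first node search: starting from @node, find_next_best_node() repeatedly picks the closest node (by node_distance()) not yet in the used_nodes bitmap, and sched_domain_node_span() ORs the CPUs of up to SD_NODES_PER_DOMAIN such nodes into the span. A minimal user-space sketch of the same search, with a made-up 4-node distance table standing in for node_distance() and a smaller per-domain limit; all names and values here are illustrative, not kernel API:

#include <limits.h>
#include <stdio.h>

#define NR_NODES 4
#define NODES_PER_DOMAIN 2	/* the patch uses 16; kept small for the example */

/* made-up symmetric distance table, diagonal = local node */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* pick the closest node to 'node' that is not yet marked used */
static int next_best_node(int node, int used[NR_NODES])
{
	int i, best = node, min_val = INT_MAX;

	for (i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;	/* start the walk at 'node' */

		if (used[n])
			continue;
		if (dist[node][n] < min_val) {
			min_val = dist[node][n];
			best = n;
		}
	}
	used[best] = 1;
	return best;
}

int main(void)
{
	int used[NR_NODES] = { 0 };
	int node = 0, i;

	used[node] = 1;
	printf("span of node %d: %d", node, node);
	for (i = 1; i < NODES_PER_DOMAIN; i++)
		printf(" %d", next_best_node(node, used));
	printf("\n");	/* prints: span of node 0: 0 1 */
	return 0;
}

With the table above the span of node 0 grows to {0, 1}, since node 1 is its closest remote node; the kernel code does the same walk over MAX_NUMNODES with cpumask operations instead of printed node ids.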
@@ -4891,36 +4964,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
- */
-static void check_sibling_maps(void)
-{
-	int i, j;
-
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
+/*
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
+ */
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
 }
 #endif
 
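
The comment introduced above explains why node-level groups stop being a static array: init_sched_build_groups() cannot express the overlapping nearest-node spans, so each node gets its own dynamically allocated list of groups, linked into a circle (the construction and the closing prev->next assignment appear later in this patch, and the matching teardown at the end). A small user-space sketch of building and walking such a ring, using a simplified group struct; the names are illustrative only:

#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;	/* circular: the last group points back to the first */
};

static struct group *new_group(int id)
{
	struct group *g = malloc(sizeof(*g));

	if (!g) {
		perror("malloc");
		exit(1);
	}
	g->id = id;
	g->next = NULL;
	return g;
}

int main(void)
{
	struct group *head, *prev, *g;
	int i;

	/* build a list of four groups, then close it into a ring */
	head = prev = new_group(0);
	for (i = 1; i < 4; i++) {
		g = new_group(i);
		prev->next = g;
		prev = g;
	}
	prev->next = head;	/* the patch closes its ring the same way */

	/* walk exactly once: stop when we are back at the head */
	g = head;
	do {
		printf("group %d\n", g->id);
		g = g->next;
	} while (g != head);
	return 0;
}

The do/while walk visits every group exactly once because the ring is closed; the same back-at-the-head test reappears in the cpu_power accumulation and in arch_destroy_sched_domains() further down.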
@@ -4928,9 +4985,24 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
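
build_sched_domains() now starts by allocating an array of MAX_NUMNODES group-list pointers and recording it in sched_group_nodes_bycpu[] under the first CPU of the map, so the matching arch_destroy_sched_domains() call can find and free it later. A user-space analogue of that allocate-and-register pattern; sizes and names below are made up for illustration:

#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 8
#define NR_CPUS   32

struct group;				/* opaque here; only pointers are stored */

/* per-"first cpu" registry, mirroring sched_group_nodes_bycpu[] */
static struct group **group_lists_bycpu[NR_CPUS];

static int build_domains(int first_cpu)
{
	struct group **lists = calloc(MAX_NODES, sizeof(*lists));

	if (!lists) {
		fprintf(stderr, "can not alloc group list\n");
		return -1;
	}
	group_lists_bycpu[first_cpu] = lists;	/* remembered for teardown */
	/* ... the per-node group rings would be built and stored here ... */
	return 0;
}

static void destroy_domains(int first_cpu)
{
	/* the real code frees each node's ring first, then the array itself */
	free(group_lists_bycpu[first_cpu]);
	group_lists_bycpu[first_cpu] = NULL;
}

int main(void)
{
	if (build_domains(0) == 0)
		destroy_domains(0);
	return 0;
}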
@@ -4943,11 +5015,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
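
The new test that gates the extra "allnodes" level compares the CPUs in the whole map against SD_NODES_PER_DOMAIN times the CPUs of the current node: only when the map is too large for one node-level domain of 16 nodes does each CPU get an all-nodes domain stacked on top, with the node domain's span then limited to sched_domain_node_span() instead of the whole map. A quick arithmetic sketch of that threshold; the CPU counts are invented examples:

#include <stdio.h>

#define SD_NODES_PER_DOMAIN 16

/* does a map of 'map_cpus' CPUs, with 'node_cpus' CPUs on this node, need the allnodes level? */
static int needs_allnodes_level(int map_cpus, int node_cpus)
{
	return map_cpus > SD_NODES_PER_DOMAIN * node_cpus;
}

int main(void)
{
	/* 64 CPUs, 4 per node -> 16 nodes: fits in one node-level domain */
	printf("64 CPUs, 4/node:  %s\n", needs_allnodes_level(64, 4) ? "yes" : "no");	/* no */
	/* 512 CPUs, 4 per node -> 128 nodes: needs the allnodes level */
	printf("512 CPUs, 4/node: %s\n", needs_allnodes_level(512, 4) ? "yes" : "no");	/* yes */
	return 0;
}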
@@ -4972,7 +5068,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5093,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
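
The loop above builds, for each node i, a ring of groups: the first group holds node i's own CPUs, and further groups add CPUs of ever-farther nodes within the domain span, with a "covered" mask guaranteeing no CPU appears twice before prev->next closes the ring. A user-space sketch of that covered-mask bookkeeping, using 64-bit integers in place of cpumask_t and an invented CPU-per-node layout; it also simplifies the kernel's (i + j) % MAX_NUMNODES visiting order and early-exit conditions to a plain loop:

#include <stdint.h>
#include <stdio.h>

#define NR_NODES 4

/* invented CPU layout: node n owns CPUs 4n..4n+3 */
static uint64_t node_mask(int n)
{
	return 0xfULL << (4 * n);
}

int main(void)
{
	int i = 0;					/* build groups for node 0 */
	uint64_t domainspan = node_mask(0) | node_mask(1) | node_mask(2);
	uint64_t covered = 0;
	int n;

	/* first group: the node's own CPUs */
	covered |= node_mask(i);
	printf("group: %#llx\n", (unsigned long long)node_mask(i));

	/* further groups: remaining nodes of the span, skipping covered CPUs */
	for (n = 0; n < NR_NODES; n++) {
		uint64_t notcovered = ~covered;
		uint64_t tmp = notcovered & domainspan & node_mask(n);

		if (!tmp)
			continue;		/* nothing new from this node */
		covered |= tmp;
		printf("group: %#llx\n", (unsigned long long)tmp);
	}
	return 0;
}

With this layout the run prints three groups (0xf, 0xf0, 0xf00), one per node in the span, and node 3 contributes nothing because it lies outside domainspan.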
@@ -5017,14 +5182,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
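
Both new power assignments reuse the existing formula: a group gets SCHED_LOAD_SCALE for its first CPU plus one tenth of SCHED_LOAD_SCALE for every further CPU, and a node group then accumulates that amount once per physical package it spans. A worked example, taking SCHED_LOAD_SCALE as 128 purely for illustration (the actual value is whatever the kernel's define says, not something this sketch fixes):

#include <stdio.h>

#define LOAD_SCALE 128	/* example stand-in for SCHED_LOAD_SCALE */

/* cpu_power of one physical package with 'cpus' CPUs, as computed in the patch */
static int package_power(int cpus)
{
	return LOAD_SCALE + LOAD_SCALE * (cpus - 1) / 10;
}

int main(void)
{
	/* a node group spanning two 2-thread packages adds the package power twice */
	int per_pkg = package_power(2);
	int node_power = 2 * per_pkg;

	printf("package_power(1) = %d\n", package_power(1));		/* 128 */
	printf("package_power(2) = %d\n", package_power(2));		/* 140 */
	printf("node group over 2 packages = %d\n", node_power);	/* 280 */
	return 0;
}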
@@ -5039,13 +5236,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5252,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
-
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map
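
arch_destroy_sched_domains() now has real work: for each map it frees the allnodes group array, walks each node's circular group list freeing every element (starting at head->next and stopping once the head itself has been freed), and finally releases the per-node pointer array. A user-space sketch of freeing such a ring; the struct and names are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct group {
	int id;
	struct group *next;	/* circular list */
};

/* free every element of a ring whose head is 'head' (may be NULL) */
static void free_ring(struct group *head)
{
	struct group *sg, *oldsg;

	if (!head)
		return;
	sg = head->next;	/* start past the head, as the patch does */
	do {
		oldsg = sg;
		sg = sg->next;
		free(oldsg);
	} while (oldsg != head);	/* stops right after the head was freed */
}

int main(void)
{
	struct group *a = malloc(sizeof(*a));
	struct group *b = malloc(sizeof(*b));

	if (!a || !b)
		return 1;
	a->id = 0; b->id = 1;
	a->next = b;
	b->next = a;		/* two-element ring */
	free_ring(a);
	puts("ring freed");
	return 0;
}

The loop mirrors the patch's next_sg label: it works for a one-element ring (head->next == head) and never reads a node after freeing it.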