Diffstat (limited to 'kernel/sched.c'):
 kernel/sched.c | 340 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 286 insertions(+), 54 deletions(-)
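For orientation only (this is not part of the patch), the following standalone C sketch reduces the greedy nearest-node selection implemented by the patch's new find_next_best_node()/sched_domain_node_span() helpers (in the diff below) to plain user-space code. The node count, the distance table and the NODES_PER_DOMAIN cap are made-up stand-ins for MAX_NUMNODES, node_distance() and SD_NODES_PER_DOMAIN: starting from a node, it repeatedly picks the closest node not yet in the span, which is the idea the patch uses to build each node's sched_domain span.

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES         4	/* stand-in for MAX_NUMNODES */
#define NODES_PER_DOMAIN 16	/* stand-in for SD_NODES_PER_DOMAIN */

/* toy symmetric node-distance table, stand-in for node_distance() */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* pick the closest node not yet in the span, scanning from @node */
static int next_best_node(int node, bool used[NR_NODES])
{
	int best = 0, min_val = INT_MAX;

	for (int i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;

		if (used[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best = n;
		}
	}
	used[best] = true;
	return best;
}

int main(void)
{
	bool used[NR_NODES] = { false };
	int node = 2;	/* node whose domain span we build */
	int span_nodes = NODES_PER_DOMAIN < NR_NODES ? NODES_PER_DOMAIN : NR_NODES;

	/* the starting node is always in its own span */
	used[node] = true;
	printf("span for node %d: %d", node, node);
	for (int i = 1; i < span_nodes; i++)
		printf(" %d", next_best_node(node, used));
	printf("\n");
	return 0;
}

Compiled and run, it prints "span for node 2: 2 3 0 1": the near node is added before the far ones, mirroring how the kernel code ORs each chosen node's cpumask into the domain span.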
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..18b95520a2e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1478,6 +1478,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
 
 /**
  * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
  * @prev: the thread we just switched away from.
  *
  * finish_task_switch must be called after the context switch, paired
@@ -4779,7 +4780,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4802,7 +4803,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4831,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void init_sched_build_groups(struct sched_group groups[],
-			cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+					int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4865,85 @@ void init_sched_build_groups(struct sched_group groups[],
 	last->next = first;
 }
 
+#define SD_NODES_PER_DOMAIN 16
 
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Start at @node */
+		n = (node + i) % MAX_NUMNODES;
+
+		if (!nr_cpus_node(n))
+			continue;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+	int i;
+	cpumask_t span, nodemask;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	nodemask = node_to_cpumask(node);
+	cpus_or(span, span, nodemask);
+	set_bit(node, used_nodes);
+
+	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		nodemask = node_to_cpumask(next_node);
+		cpus_or(span, span, nodemask);
+	}
+
+	return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4965,20 @@ static int cpu_to_phys_group(int cpu)
 }
 
 #ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
-	return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
 /*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
  */
-static void check_sibling_maps(void)
-{
-	int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 
-	for_each_online_cpu(i) {
-		for_each_cpu_mask(j, cpu_sibling_map[i]) {
-			if (cpu_to_node(i) != cpu_to_node(j)) {
-				printk(KERN_INFO "warning: CPU %d siblings map "
-					"to different node - isolating "
-					"them.\n", i);
-				cpu_sibling_map[i] = cpumask_of_cpu(i);
-				break;
-			}
-		}
-	}
-}
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+	return cpu_to_node(cpu);
+}
 #endif
 
@@ -4928,9 +4986,24 @@ static void check_sibling_maps(void)
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
+#ifdef CONFIG_NUMA
+	struct sched_group **sched_group_nodes = NULL;
+	struct sched_group *sched_group_allnodes = NULL;
+
+	/*
+	 * Allocate the per-node list of sched groups
+	 */
+	sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+					   GFP_ATOMIC);
+	if (!sched_group_nodes) {
+		printk(KERN_WARNING "Can not alloc sched group node list\n");
+		return;
+	}
+	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
 
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5016,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
+		if (cpus_weight(*cpu_map)
+				> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+			if (!sched_group_allnodes) {
+				sched_group_allnodes
+					= kmalloc(sizeof(struct sched_group)
+							* MAX_NUMNODES,
+						  GFP_KERNEL);
+				if (!sched_group_allnodes) {
+					printk(KERN_WARNING
+					"Can not alloc allnodes sched group\n");
+					break;
+				}
+				sched_group_allnodes_bycpu[i]
+						= sched_group_allnodes;
+			}
+			sd = &per_cpu(allnodes_domains, i);
+			*sd = SD_ALLNODES_INIT;
+			sd->span = *cpu_map;
+			group = cpu_to_allnodes_group(i);
+			sd->groups = &sched_group_allnodes[group];
+			p = sd;
+		} else
+			p = NULL;
+
 		sd = &per_cpu(node_domains, i);
-		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = *cpu_map;
-		sd->groups = &sched_group_nodes[group];
+		sd->span = sched_domain_node_span(cpu_to_node(i));
+		sd->parent = p;
+		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
 		p = sd;
@@ -4972,7 +5069,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_SCHED_SMT
 	/* Set up CPU (sibling) groups */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
 		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5094,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, *cpu_map,
-					&cpu_to_node_group);
+	if (sched_group_allnodes)
+		init_sched_build_groups(sched_group_allnodes, *cpu_map,
+					&cpu_to_allnodes_group);
+
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		/* Set up node groups */
+		struct sched_group *sg, *prev;
+		cpumask_t nodemask = node_to_cpumask(i);
+		cpumask_t domainspan;
+		cpumask_t covered = CPU_MASK_NONE;
+		int j;
+
+		cpus_and(nodemask, nodemask, *cpu_map);
+		if (cpus_empty(nodemask)) {
+			sched_group_nodes[i] = NULL;
+			continue;
+		}
+
+		domainspan = sched_domain_node_span(i);
+		cpus_and(domainspan, domainspan, *cpu_map);
+
+		sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+		sched_group_nodes[i] = sg;
+		for_each_cpu_mask(j, nodemask) {
+			struct sched_domain *sd;
+			sd = &per_cpu(node_domains, j);
+			sd->groups = sg;
+			if (sd->groups == NULL) {
+				/* Turn off balancing if we have no groups */
+				sd->flags = 0;
+			}
+		}
+		if (!sg) {
+			printk(KERN_WARNING
+			"Can not alloc domain group for node %d\n", i);
+			continue;
+		}
+		sg->cpu_power = 0;
+		sg->cpumask = nodemask;
+		cpus_or(covered, covered, nodemask);
+		prev = sg;
+
+		for (j = 0; j < MAX_NUMNODES; j++) {
+			cpumask_t tmp, notcovered;
+			int n = (i + j) % MAX_NUMNODES;
+
+			cpus_complement(notcovered, covered);
+			cpus_and(tmp, notcovered, *cpu_map);
+			cpus_and(tmp, tmp, domainspan);
+			if (cpus_empty(tmp))
+				break;
+
+			nodemask = node_to_cpumask(n);
+			cpus_and(tmp, tmp, nodemask);
+			if (cpus_empty(tmp))
+				continue;
+
+			sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+			if (!sg) {
+				printk(KERN_WARNING
+				"Can not alloc domain group for node %d\n", j);
+				break;
+			}
+			sg->cpu_power = 0;
+			sg->cpumask = tmp;
+			cpus_or(covered, covered, tmp);
+			prev->next = sg;
+			prev = sg;
+		}
+		prev->next = sched_group_nodes[i];
+	}
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5183,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 		sd->groups->cpu_power = power;
 
 #ifdef CONFIG_NUMA
-		if (i == first_cpu(sd->groups->cpumask)) {
-			/* Only add "power" once for each physical package. */
-			sd = &per_cpu(node_domains, i);
-			sd->groups->cpu_power += power;
+		sd = &per_cpu(allnodes_domains, i);
+		if (sd->groups) {
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+			sd->groups->cpu_power = power;
 		}
 #endif
 	}
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		struct sched_group *sg = sched_group_nodes[i];
+		int j;
+
+		if (sg == NULL)
+			continue;
+next_sg:
+		for_each_cpu_mask(j, sg->cpumask) {
+			struct sched_domain *sd;
+			int power;
+
+			sd = &per_cpu(phys_domains, j);
+			if (j != first_cpu(sd->groups->cpumask)) {
+				/*
+				 * Only add "power" once for each
+				 * physical package.
+				 */
+				continue;
+			}
+			power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+				(cpus_weight(sd->groups->cpumask)-1) / 10;
+
+			sg->cpu_power += power;
+		}
+		sg = sg->next;
+		if (sg != sched_group_nodes[i])
+			goto next_sg;
+	}
+#endif
+
 	/* Attach the domains */
 	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
@@ -5039,13 +5237,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  */
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
 {
 	cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
 	/*
 	 * Setup mask for cpus without special case scheduling requirements.
 	 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5253,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
-	/* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+	int i;
+	int cpu;
+
+	for_each_cpu_mask(cpu, *cpu_map) {
+		struct sched_group *sched_group_allnodes
+			= sched_group_allnodes_bycpu[cpu];
+		struct sched_group **sched_group_nodes
+			= sched_group_nodes_bycpu[cpu];
+
+		if (sched_group_allnodes) {
+			kfree(sched_group_allnodes);
+			sched_group_allnodes_bycpu[cpu] = NULL;
+		}
+
+		if (!sched_group_nodes)
+			continue;
+
+		for (i = 0; i < MAX_NUMNODES; i++) {
+			cpumask_t nodemask = node_to_cpumask(i);
+			struct sched_group *oldsg, *sg = sched_group_nodes[i];
 
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+			cpus_and(nodemask, nodemask, *cpu_map);
+			if (cpus_empty(nodemask))
+				continue;
+
+			if (sg == NULL)
+				continue;
+			sg = sg->next;
+next_sg:
+			oldsg = sg;
+			sg = sg->next;
+			kfree(oldsg);
+			if (oldsg != sched_group_nodes[i])
+				goto next_sg;
+		}
+		kfree(sched_group_nodes);
+		sched_group_nodes_bycpu[cpu] = NULL;
+	}
+#endif
+}
 
 /*
  * Detach sched domains from a group of cpus specified in cpu_map