author    Linus Torvalds <torvalds@linux-foundation.org>  2012-06-08 17:59:29 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-06-08 17:59:29 -0400
commit    72494504498ff5ac2f086a83473d4dd1ca490bd3 (patch)
tree      7f1ceab43de3580235f1a56f2ae865901c09e4d7 /kernel/sched
parent    cd96891d48a945ca2011fbeceda73813d6286195 (diff)
parent    a841f8cef4bb124f0f5563314d0beaf2e1249d72 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix the relax_domain_level boot parameter
  sched: Validate assumptions in sched_init_numa()
  sched: Always initialize cpu-power
  sched: Fix domain iteration
  sched/rt: Fix lockdep annotation within find_lock_lowest_rq()
  sched/numa: Load balance between remote nodes
  sched/x86: Calculate booted cores after construction of sibling_mask
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c    187
-rw-r--r--   kernel/sched/fair.c      7
-rw-r--r--   kernel/sched/rt.c        2
-rw-r--r--   kernel/sched/sched.h     2
4 files changed, 159 insertions(+), 39 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c46958e26121..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5556,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static __read_mostly int sched_domain_debug_enabled;
+static __read_mostly int sched_debug_enabled;
 
-static int __init sched_domain_debug_setup(char *str)
+static int __init sched_debug_setup(char *str)
 {
-	sched_domain_debug_enabled = 1;
+	sched_debug_enabled = 1;
 
 	return 0;
 }
-early_param("sched_debug", sched_domain_debug_setup);
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled;
+}
 
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
@@ -5604,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->sgp->power) {
+		/*
+		 * Even though we initialize ->power to something semi-sane,
+		 * we leave power_orig unset. This allows us to detect if
+		 * domain iteration is still funny without causing /0 traps.
+		 */
+		if (!group->sgp->power_orig) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -5652,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
 	int level = 0;
 
-	if (!sched_domain_debug_enabled)
+	if (!sched_debug_enabled)
 		return;
 
 	if (!sd) {
@@ -5673,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+	return false;
+}
 #endif /* CONFIG_SCHED_DEBUG */
 
 static int sd_degenerate(struct sched_domain *sd)
@@ -5994,6 +6008,44 @@ struct sched_domain_topology_level {
 	struct sd_data data;
 };
 
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth, make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+	const struct cpumask *span = sched_domain_span(sd);
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *sibling;
+	int i;
+
+	for_each_cpu(i, span) {
+		sibling = *per_cpu_ptr(sdd->sd, i);
+		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+			continue;
+
+		cpumask_set_cpu(i, sched_group_mask(sg));
+	}
+}
+
+/*
+ * Return the canonical balance cpu for this group, this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+	return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
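Note: the following is a minimal userspace sketch (not kernel code) of the idea behind build_group_mask() and group_balance_cpu(): build an iteration mask from the CPUs whose sibling domain actually contains them, then pick the first CPU that sits in both the group's span and that mask. The 64-bit bitmask type, the hard-coded spans, and the empty sibling span for CPU 3 are illustrative assumptions, not part of the patch.

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for cpumask operations, limited to 64 CPUs. */
typedef uint64_t mask_t;

static int first_and(mask_t a, mask_t b)
{
	mask_t both = a & b;

	/* GCC/Clang builtin; -1 plays the role of nr_cpu_ids (no CPU). */
	return both ? __builtin_ctzll(both) : -1;
}

int main(void)
{
	/* Hypothetical spans: the domain covers CPUs 0-3. */
	mask_t domain_span = 0xf;
	/* Sibling domain spans per CPU; CPU 3's sibling span is empty,
	 * mimicking an asymmetric setup where iteration stopped early. */
	mask_t sibling_span[4] = { 0xf, 0xf, 0xf, 0x0 };
	mask_t group_cpus = 0xf, group_mask = 0;
	int i;

	/* build_group_mask() analogue: only CPUs whose sibling domain
	 * actually contains them make it into the iteration mask. */
	for (i = 0; i < 4; i++) {
		if (!(domain_span & (1ULL << i)))
			continue;
		if (sibling_span[i] & (1ULL << i))
			group_mask |= 1ULL << i;
	}

	/* group_balance_cpu() analogue: first CPU in both masks. */
	printf("balance cpu = %d\n", first_and(group_cpus, group_mask));
	return 0;
}

With the asymmetric spans above, CPU 3 is left out of the mask and the canonical balance CPU resolves to CPU 0.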
@@ -6012,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		if (cpumask_test_cpu(i, covered))
 			continue;
 
+		child = *per_cpu_ptr(sdd->sd, i);
+
+		/* See the comment near build_group_mask(). */
+		if (!cpumask_test_cpu(i, sched_domain_span(child)))
+			continue;
+
 		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 				GFP_KERNEL, cpu_to_node(cpu));
 
@@ -6019,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 			goto fail;
 
 		sg_span = sched_group_cpus(sg);
-
-		child = *per_cpu_ptr(sdd->sd, i);
 		if (child->child) {
 			child = child->child;
 			cpumask_copy(sg_span, sched_domain_span(child));
@@ -6030,13 +6086,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 		cpumask_or(covered, covered, sg_span);
 
 		sg->sgp = *per_cpu_ptr(sdd->sgp, i);
-		atomic_inc(&sg->sgp->ref);
+		if (atomic_inc_return(&sg->sgp->ref) == 1)
+			build_group_mask(sd, sg);
 
+		/*
+		 * Initialize sgp->power such that even if we mess up the
+		 * domains and no possible iteration will get us here, we won't
+		 * die on a /0 trap.
+		 */
+		sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
+
+		/*
+		 * Make sure the first group of this domain contains the
+		 * canonical balance cpu. Otherwise the sched_domain iteration
+		 * breaks. See update_sg_lb_stats().
+		 */
 		if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
-		    cpumask_first(sg_span) == cpu) {
-			WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+		    group_balance_cpu(sg) == cpu)
 			groups = sg;
-		}
 
 		if (!first)
 			first = sg;
@@ -6109,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
+		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -6150,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_first_cpu(sg))
+	if (cpu != group_balance_cpu(sg))
 		return;
 
 	update_group_power(sd, cpu);
@@ -6200,11 +6268,8 @@ int sched_domain_level_max;
 
 static int __init setup_relax_domain_level(char *str)
 {
-	unsigned long val;
-
-	val = simple_strtoul(str, NULL, 0);
-	if (val < sched_domain_level_max)
-		default_relax_domain_level = val;
+	if (kstrtoint(str, 0, &default_relax_domain_level))
+		pr_warn("Unable to set relax_domain_level\n");
 
 	return 1;
 }
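Note: a hedged userspace analogue (using strtol(), not the kernel's kstrtoint()) of the stricter parse introduced above: the whole string must be a valid int, and failures are reported instead of a partially parsed or unchecked value being used silently.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Userspace sketch of a kstrtoint()-style strict parse: reject trailing
 * junk, empty strings and out-of-range values rather than accepting
 * whatever prefix happens to parse.
 */
static int strict_parse_int(const char *s, int *out)
{
	char *end;
	long val;

	errno = 0;
	val = strtol(s, &end, 0);
	if (errno || end == s || *end != '\0' || val < INT_MIN || val > INT_MAX)
		return -1;

	*out = (int)val;
	return 0;
}

int main(void)
{
	/* "2" and "-1" parse; "2x" and "" are rejected, not half-accepted. */
	const char *inputs[] = { "2", "-1", "2x", "" };
	int level;

	for (unsigned i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
		if (strict_parse_int(inputs[i], &level))
			printf("\"%s\": unable to set relax_domain_level\n", inputs[i]);
		else
			printf("\"%s\": relax_domain_level = %d\n", inputs[i], level);
	}
	return 0;
}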
@@ -6314,14 +6379,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
 #ifdef CONFIG_NUMA
 
 static int sched_domains_numa_levels;
-static int sched_domains_numa_scale;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
 
 static inline int sd_local_flags(int level)
 {
-	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
 		return 0;
 
 	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6379,6 +6443,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
 }
 
+static void sched_numa_warn(const char *str)
+{
+	static int done = false;
+	int i,j;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		printk(KERN_WARNING " ");
+		for (j = 0; j < nr_node_ids; j++)
+			printk(KERN_CONT "%02d ", node_distance(i,j));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+	int i;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		if (sched_domains_numa_distance[i] == distance)
+			return true;
+	}
+
+	return false;
+}
+
 static void sched_init_numa(void)
 {
 	int next_distance, curr_distance = node_distance(0, 0);
@@ -6386,7 +6486,6 @@ static void sched_init_numa(void)
 	int level = 0;
 	int i, j, k;
 
-	sched_domains_numa_scale = curr_distance;
 	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
 	if (!sched_domains_numa_distance)
 		return;
@@ -6397,23 +6496,41 @@ static void sched_init_numa(void)
 	 *
 	 * Assumes node_distance(0,j) includes all distances in
 	 * node_distance(i,j) in order to avoid cubic time.
-	 *
-	 * XXX: could be optimized to O(n log n) by using sort()
 	 */
 	next_distance = curr_distance;
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			int distance = node_distance(0, j);
-			if (distance > curr_distance &&
-				(distance < next_distance ||
-				next_distance == curr_distance))
-				next_distance = distance;
+			for (k = 0; k < nr_node_ids; k++) {
+				int distance = node_distance(i, k);
+
+				if (distance > curr_distance &&
+				    (distance < next_distance ||
+				     next_distance == curr_distance))
+					next_distance = distance;
+
+				/*
+				 * While not a strong assumption it would be nice to know
+				 * about cases where if node A is connected to B, B is not
+				 * equally connected to A.
+				 */
+				if (sched_debug() && node_distance(k, i) != distance)
+					sched_numa_warn("Node-distance not symmetric");
+
+				if (sched_debug() && i && !find_numa_distance(distance))
+					sched_numa_warn("Node-0 not representative");
+			}
+			if (next_distance != curr_distance) {
+				sched_domains_numa_distance[level++] = next_distance;
+				sched_domains_numa_levels = level;
+				curr_distance = next_distance;
+			} else break;
 		}
-		if (next_distance != curr_distance) {
-			sched_domains_numa_distance[level++] = next_distance;
-			sched_domains_numa_levels = level;
-			curr_distance = next_distance;
-		} else break;
+
+		/*
+		 * In case of sched_debug() we verify the above assumption.
+		 */
+		if (!sched_debug())
+			break;
 	}
 	/*
 	 * 'level' contains the number of unique distances, excluding the
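Note: the sched_debug() checks added above assume the firmware distance table is symmetric and that node 0's row contains every distance that occurs anywhere. A toy userspace sketch of the same validation over a hypothetical 4-node table (the matrix values are invented for illustration, with one deliberately asymmetric entry) follows.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical SLIT-style table; dist[2][3] is deliberately asymmetric. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 20, 30 },
	{ 20, 20, 10, 25 },
	{ 30, 30, 30, 10 },
};

/* Does distance 'd' occur somewhere in row 0 (i.e. relative to node 0)? */
static bool in_row0(int d)
{
	for (int j = 0; j < NR_NODES; j++)
		if (dist[0][j] == d)
			return true;
	return false;
}

int main(void)
{
	for (int i = 0; i < NR_NODES; i++) {
		for (int k = 0; k < NR_NODES; k++) {
			if (dist[i][k] != dist[k][i])
				printf("Node-distance not symmetric: (%d,%d)\n", i, k);
			if (!in_row0(dist[i][k]))
				printf("Node-0 not representative: %d\n", dist[i][k]);
		}
	}
	return 0;
}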
@@ -6525,7 +6642,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
 
-			sgp = kzalloc_node(sizeof(struct sched_group_power),
+			sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sgp)
 				return -ENOMEM;
@@ -6578,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 	if (!sd)
 		return child;
 
-	set_domain_attribute(sd, attr);
 	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
 	if (child) {
 		sd->level = child->level + 1;
@@ -6586,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		child->parent = sd;
 	}
 	sd->child = child;
+	set_domain_attribute(sd, attr);
 
 	return sd;
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d5583f9588e7..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3602,7 +3602,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
 		} while (group != child->groups);
 	}
 
-	sdg->sgp->power = power;
+	sdg->sgp->power_orig = sdg->sgp->power = power;
 }
 
 /*
@@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	int i;
 
 	if (local_group)
-		balance_cpu = group_first_cpu(group);
+		balance_cpu = group_balance_cpu(group);
 
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
@@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu) {
+			if (idle_cpu(i) && !first_idle_cpu &&
+					cpumask_test_cpu(i, sched_group_mask(group))) {
 				first_idle_cpu = 1;
 				balance_cpu = i;
 			}
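Note: with overlapping groups a CPU can sit in a group's span without being part of its iteration mask, so the hunk above only lets an idle CPU claim balance_cpu when sched_group_mask() includes it. A bitmask sketch of that selection (toy types, hypothetical idle states, not the kernel API):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Group spans CPUs 0-3, but only CPUs 1-3 are in its iteration mask. */
	uint64_t group_cpus = 0xf, group_mask = 0xe;
	/* Hypothetical idle states: CPU 0 and CPU 2 are idle. */
	bool idle[4] = { true, false, true, false };
	int balance_cpu = __builtin_ctzll(group_cpus & group_mask); /* canonical */
	bool first_idle_cpu = false;

	for (int i = 0; i < 4; i++) {
		if (!(group_cpus & (1ULL << i)))
			continue;
		/* Prefer the first idle CPU, but only if the mask allows it:
		 * CPU 0 is idle yet excluded, so CPU 2 takes over instead. */
		if (idle[i] && !first_idle_cpu && (group_mask & (1ULL << i))) {
			first_idle_cpu = true;
			balance_cpu = i;
		}
	}
	printf("balance_cpu = %d\n", balance_cpu);
	return 0;
}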
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2a4e8dffbd6b..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1562,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 				     task_running(rq, task) ||
 				     !task->on_rq)) {
 
-				raw_spin_unlock(&lowest_rq->lock);
+				double_unlock_balance(rq, lowest_rq);
 				lowest_rq = NULL;
 				break;
 			}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_id);
 
+extern int group_balance_cpu(struct sched_group *sg);
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"