author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2012-05-10 18:12:02 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-05-14 09:05:26 -0400
commit		04f733b4afac5dc93ae9b0a8703c60b87def491e
tree		7ef166c4d1c1dffdc993efbf3791d0f745f5a80c /kernel/sched
parent		316ad248307fba13be40f01e92a22b89457c32bc
sched/fair: Revert sched-domain iteration breakage
Patches c22402a2f ("sched/fair: Let minimally loaded cpu balance the group") and 0ce90475 ("sched/fair: Add some serialization to the sched_domain load-balance walk") are horribly broken, so revert them.

The problem is that while it sounds good to have the minimally loaded cpu do the pulling of more load, the way we walk the domains gives absolutely no guarantee that this cpu will actually get to the domain; in fact it is very likely it won't. Therefore, the higher up the tree we get, the less likely it is we'll balance at all.

The first-of-mask rule always walks up: while sucky in that it accumulates load on the first cpu and needs extra passes to spread it out, it at least guarantees that a cpu gets up that far and that load balancing happens at all.

Since it is now always the first cpu, and idle cpus should always be able to balance (so that they get a task as fast as possible), we can also do away with the added serialization.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/n/tip-rpuhs5s56aiv1aw7khv9zkw6@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
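To make the argument above concrete, here is a small standalone sketch; it is not kernel code, and the domain layout, the load samples and every name in it (group_of, first_of_mask, least_loaded, walk) are invented for illustration. Each simulated cpu walks its domain levels bottom-up, the way rebalance_domains() does, and stops at the first level where it is not the cpu elected to balance its local group (the *balance = 0 case). The first-of-mask election is stable across levels, so cpu0 always survives to the top level; a least-loaded election, computed from loads sampled at different moments, can pick a cpu that already gave up at a lower level, and then nobody balances the top-level domain.

/* sketch.c - illustration only, not kernel code */
#include <stdio.h>

#define NR_CPUS   4
#define NR_LEVELS 2

/* group id of each cpu at each domain level: pairs, then one big group */
static const int group_of[NR_LEVELS][NR_CPUS] = {
        { 0, 0, 1, 1 },
        { 0, 0, 0, 0 },
};

/* loads as sampled when each level is examined; they change in between */
static const int load_at[NR_LEVELS][NR_CPUS] = {
        { 2, 1, 4, 3 },         /* level 0: cpu1 and cpu3 look least loaded */
        { 0, 5, 1, 6 },         /* level 1: by now cpu0 looks least loaded  */
};

/* lowest-numbered cpu of the walking cpu's group at this level */
static int first_of_mask(int level, int cpu)
{
        for (int i = 0; i < NR_CPUS; i++)
                if (group_of[level][i] == group_of[level][cpu])
                        return i;
        return cpu;
}

/* least loaded cpu of the walking cpu's group, per that level's load sample */
static int least_loaded(int level, int cpu)
{
        int best = -1;

        for (int i = 0; i < NR_CPUS; i++) {
                if (group_of[level][i] != group_of[level][cpu])
                        continue;
                if (best < 0 || load_at[level][i] < load_at[level][best])
                        best = i;
        }
        return best;
}

static void walk(const char *name, int (*pick)(int level, int cpu))
{
        int top_balanced = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                for (int level = 0; level < NR_LEVELS; level++) {
                        if (pick(level, cpu) != cpu)
                                break;          /* not elected: *balance = 0 */
                        if (level == NR_LEVELS - 1)
                                top_balanced = 1;
                }
        }
        printf("%-13s: top-level domain %s balanced\n",
               name, top_balanced ? "gets" : "never gets");
}

int main(void)
{
        walk("first-of-mask", first_of_mask);   /* cpu0 survives every level  */
        walk("least-loaded", least_loaded);     /* no cpu survives to the top */
        return 0;
}

Built with a C99 compiler (e.g. gcc sketch.c), it prints that the top-level domain gets balanced under first-of-mask and never gets balanced under least-loaded with these load samples.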
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/core.c	2
-rw-r--r--	kernel/sched/fair.c	19
2 files changed, 7 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0738036fa569..24922b7ff567 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5976,7 +5976,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 
 		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
 		atomic_inc(&sg->sgp->ref);
-		sg->balance_cpu = -1;
 
 		if (cpumask_test_cpu(cpu, sg_span))
 			groups = sg;
@@ -6052,7 +6051,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 
 		cpumask_clear(sched_group_cpus(sg));
 		sg->sgp->power = 0;
-		sg->balance_cpu = -1;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9bd3366dbb1c..a259a614b394 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3776,8 +3776,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			int *balance, struct sg_lb_stats *sgs)
 {
 	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
-	unsigned int balance_cpu = -1;
-	unsigned long balance_load = ~0UL;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
 	int i;
 
@@ -3794,11 +3793,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			load = target_load(i, load_idx);
-			if (load < balance_load || idle_cpu(i)) {
-				balance_load = load;
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
 				balance_cpu = i;
 			}
+
+			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
 			if (load > max_cpu_load) {
@@ -3824,8 +3824,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	 */
 	if (local_group) {
 		if (env->idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != env->dst_cpu ||
-			    cmpxchg(&group->balance_cpu, -1, balance_cpu) != -1) {
+			if (balance_cpu != env->dst_cpu) {
 				*balance = 0;
 				return;
 			}
@@ -4919,7 +4918,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	int balance = 1;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
-	struct sched_domain *sd, *last = NULL;
+	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
@@ -4929,7 +4928,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		last = sd;
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
 
@@ -4974,9 +4972,6 @@ out:
 		if (!balance)
 			break;
 	}
-	for (sd = last; sd; sd = sd->child)
-		(void)cmpxchg(&sd->groups->balance_cpu, cpu, -1);
-
 	rcu_read_unlock();
 
 	/*
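For reference, the election that the revert restores in update_sg_lb_stats() boils down to: the first idle cpu found in the local group becomes balance_cpu, and a cpu that is not the elected one abandons the (non-newly-idle) balance pass with *balance = 0. The following distilled sketch shows just that check; the struct, the idle flags and the cpu numbers are invented here and are not taken from the kernel sources.

/* election.c - illustration only, not kernel code */
#include <stdio.h>

struct group_sketch {
        int cpus[8];            /* cpu ids of the local group, terminated by -1 */
        int idle[8];            /* 1 if the cpu at the same index is idle       */
};

/* the first idle cpu of the group wins; -1 if nobody is idle */
static int elect_balance_cpu(const struct group_sketch *g)
{
        int balance_cpu = -1, first_idle_cpu = 0;

        for (int i = 0; g->cpus[i] >= 0; i++) {
                if (g->idle[i] && !first_idle_cpu) {
                        first_idle_cpu = 1;
                        balance_cpu = g->cpus[i];
                }
        }
        return balance_cpu;
}

int main(void)
{
        struct group_sketch g = {
                .cpus = { 4, 5, 6, 7, -1 },
                .idle = { 0, 0, 1, 0 },
        };
        int dst_cpu = 5;                        /* the cpu running this balance pass */
        int balance_cpu = elect_balance_cpu(&g);

        if (balance_cpu != dst_cpu)             /* the restored check from the hunk above */
                printf("cpu%d: cpu%d is elected, so *balance = 0 and we stop\n",
                       dst_cpu, balance_cpu);
        else
                printf("cpu%d: elected, keep balancing this domain\n", dst_cpu);
        return 0;
}

With this sample group, cpu6 is the first idle cpu, so cpu5 prints that it stops and leaves the balancing of this domain to cpu6.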