author     Ken Chen <kenchen@google.com>        2008-11-19 01:41:57 -0500
committer  Ingo Molnar <mingo@elte.hu>          2008-11-19 12:39:37 -0500
commit     ec4e0e2fe018992d980910db901637c814575914 (patch)
tree       58f5df8581387afc90774ee2d1923302ae209b3c /kernel/sched.c
parent     3ac3ba0b396fd99550e08034b0e4c27fdf39c252 (diff)
sched: fix inconsistency when redistributing per-cpu tg->cfs_rq shares
Impact: make load-balancing more consistent
In the update_shares() path leading to tg_shares_up(), the calculation of
per-cpu cfs_rq shares is rather erratic even under a moderate task wake-up
rate. The problem is that the per-cpu tg->cfs_rq load weight used in the
sd_rq_weight aggregation and the actual redistribution of cfs_rq->shares
are collected at different times. Under moderate system load, we've seen
quite a bit of variation in cfs_rq->shares, which in turn wildly affects
the sched_entity's load weight.
This patch caches each CPU's load weight while doing the sum calculation
and then passes the cached value down to update_group_shares_cpu() for
redistributing the per-cpu cfs_rq shares. This keeps the total cfs_rq
shares consistent across all CPUs. It also simplifies the rounding and the
zero-load-weight check.
Signed-off-by: Ken Chen <kenchen@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
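
For reference, a minimal user-space sketch of the arithmetic the commit
message describes; it is illustration only, not kernel code. The array
names, CPU count and weight values are invented, and the
MIN_SHARES/MAX_SHARES clamping and the NICE_0_LOAD boost for idle CPUs are
left out for brevity. It shows how re-reading the live per-cpu weights at
redistribution time (the old scheme) lets the per-cpu shares drift away
from tg->shares, while reusing the weights cached during the sum pass keeps
the total consistent.

/*
 * Illustration only -- a stand-alone user-space model, not kernel code.
 * All names and numbers below are made up.
 */
#include <stdio.h>

#define NR_CPUS   4
#define TG_SHARES 1024UL

int main(void)
{
        /* per-cpu cfs_rq load weight at the time the sum is taken */
        unsigned long weight_at_sum[NR_CPUS]  = { 1024, 1024, 2048, 0 };
        /* the same weights a moment later, after a wakeup on cpu 3 */
        unsigned long weight_at_dist[NR_CPUS] = { 1024, 1024, 2048, 1024 };
        unsigned long sum = 0, old_total = 0, new_total = 0;
        int i;

        for (i = 0; i < NR_CPUS; i++)
                sum += weight_at_sum[i];

        for (i = 0; i < NR_CPUS; i++) {
                /* old scheme: re-reads the live weight, which has changed */
                old_total += TG_SHARES * weight_at_dist[i] / (sum + 1);
                /* patched scheme: reuses the weight cached during the sum */
                new_total += TG_SHARES * weight_at_sum[i] / sum;
        }

        printf("tg->shares=%lu  live-weight total=%lu  cached-weight total=%lu\n",
               TG_SHARES, old_total, new_total);
        return 0;
}

With these made-up numbers the live-weight total comes out to 1276 against
a tg->shares of 1024, while the cached-weight total is exactly 1024: the
per-cpu shares sum back to the group's shares because they were derived
from the same snapshot as the denominator.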
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   41
1 file changed, 15 insertions, 26 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index a4c156d9a4a5..93bfb086e60f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1453,27 +1453,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                         unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-        int boost = 0;
         unsigned long shares;
         unsigned long rq_weight;
 
         if (!tg->se[cpu])
                 return;
 
-        rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-        /*
-         * If there are currently no tasks on the cpu pretend there is one of
-         * average load so that when a new task gets to run here it will not
-         * get delayed by group starvation.
-         */
-        if (!rq_weight) {
-                boost = 1;
-                rq_weight = NICE_0_LOAD;
-        }
-
-        if (unlikely(rq_weight > sd_rq_weight))
-                rq_weight = sd_rq_weight;
+        rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
         /*
          *           \Sum shares * rq_weight
@@ -1481,7 +1467,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
          *               \Sum rq_weight
          *
          */
-        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+        shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
         if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1490,11 +1476,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
 
                 spin_lock_irqsave(&rq->lock, flags);
-                /*
-                 * record the actual number of shares, not the boosted amount.
-                 */
-                tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-                tg->cfs_rq[cpu]->rq_weight = rq_weight;
+                tg->cfs_rq[cpu]->shares = shares;
 
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1508,13 +1490,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-        unsigned long rq_weight = 0;
+        unsigned long weight, rq_weight = 0;
         unsigned long shares = 0;
         struct sched_domain *sd = data;
         int i;
 
         for_each_cpu_mask(i, sd->span) {
-                rq_weight += tg->cfs_rq[i]->load.weight;
+                /*
+                 * If there are currently no tasks on the cpu pretend there
+                 * is one of average load so that when a new task gets to
+                 * run here it will not get delayed by group starvation.
+                 */
+                weight = tg->cfs_rq[i]->load.weight;
+                if (!weight)
+                        weight = NICE_0_LOAD;
+
+                tg->cfs_rq[i]->rq_weight = weight;
+                rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
 
@@ -1524,9 +1516,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
 
-        if (!rq_weight)
-                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
         for_each_cpu_mask(i, sd->span)
                 update_group_shares_cpu(tg, i, shares, rq_weight);
 
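
To make the patched two-pass flow concrete, a hedged stand-alone mock of it
follows: the first pass caches each cpu's weight (substituting NICE_0_LOAD
for an idle cpu) while building the sum, and the second pass hands exactly
those cached values to the redistribution step. struct mock_cfs_rq, the
weight values and clamp_shares() are invented stand-ins; the real code
walks the sched-domain span, takes rq->lock and calls __set_se_shares().

/* Illustration only -- a user-space mock of the patched flow. */
#include <stdio.h>

#define NR_CPUS     4
#define NICE_0_LOAD 1024UL
#define MIN_SHARES  2UL
#define MAX_SHARES  (1UL << 18)

struct mock_cfs_rq {
        unsigned long load_weight;      /* stands in for cfs_rq->load.weight */
        unsigned long rq_weight;        /* cached copy, as the patch adds */
        unsigned long shares;
};

static unsigned long clamp_shares(unsigned long s)
{
        if (s < MIN_SHARES)
                return MIN_SHARES;
        if (s > MAX_SHARES)
                return MAX_SHARES;
        return s;
}

int main(void)
{
        struct mock_cfs_rq cfs_rq[NR_CPUS] = {
                { 1024, 0, 0 }, { 3072, 0, 0 }, { 0, 0, 0 }, { 2048, 0, 0 },
        };
        unsigned long tg_shares = 1024, rq_weight = 0;
        int i;

        /* pass 1 (tg_shares_up in the patch): cache the weight, let idle
         * cpus pretend NICE_0_LOAD, and accumulate the sum from exactly
         * those cached values */
        for (i = 0; i < NR_CPUS; i++) {
                unsigned long weight = cfs_rq[i].load_weight;

                if (!weight)
                        weight = NICE_0_LOAD;

                cfs_rq[i].rq_weight = weight;
                rq_weight += weight;
        }

        /* pass 2 (update_group_shares_cpu): redistribute tg_shares in
         * proportion to the same cached weights, so the per-cpu shares
         * stay consistent with the sum they were derived from */
        for (i = 0; i < NR_CPUS; i++) {
                cfs_rq[i].shares = clamp_shares(tg_shares *
                                cfs_rq[i].rq_weight / rq_weight);
                printf("cpu%d: rq_weight=%lu shares=%lu\n",
                       i, cfs_rq[i].rq_weight, cfs_rq[i].shares);
        }
        return 0;
}

Note that, unlike the pre-patch code, nothing re-reads load.weight between
the two passes; that is the whole point of caching cfs_rq->rq_weight.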