1 files changed, 126 insertions, 33 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5c9e67923b7c..8a39fa3e3c6c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -772,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
                list_del_leaf_cfs_rq(cfs_rq);
 }
+/*
+ * calc_tg_weight - estimate the current total load weight of task group @tg.
+ * @tg:     task group whose summed load_weight is read.
+ * @cfs_rq: the runqueue whose recorded contribution is swapped for its
+ *          current weight.
+ *
+ * Starts from tg->load_weight, removes @cfs_rq's recorded load_contribution
+ * and adds @cfs_rq's current load.weight in its place.
+ *
+ * Returns the adjusted weight as a long. The value is not clamped here; the
+ * callers visible in this patch test the result before dividing by it.
+ */
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+        long tg_weight;
+        /*
+         * Use this CPU's actual weight instead of the last load_contribution
+         * to gain a more accurate current total weight. See
+         * update_cfs_rq_load_contribution().
+         */
+        tg_weight = atomic_read(&tg->load_weight);
+        tg_weight -= cfs_rq->load_contribution;
+        tg_weight += cfs_rq->load.weight;
+        return tg_weight;
+}
 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
-        long load_weight, load, shares;
+        long tg_weight, load, shares;
+        tg_weight = calc_tg_weight(tg, cfs_rq);
        load = cfs_rq->load.weight;
-        load_weight = atomic_read(&tg->load_weight);
-        load_weight += load;
-        load_weight -= cfs_rq->load_contribution;
        shares = (tg->shares * load);
-        if (load_weight)
+        if (tg_weight)
-                shares /= load_weight;
+                shares /= tg_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
@@ -1743,7 +1756,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-        if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+        if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
                return;
        __return_cfs_rq_runtime(cfs_rq);
@@ -2036,36 +2049,100 @@ static void task_waking_fair(struct task_struct *p)
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ *   s_i = rw_i / \Sum rw_j                                             (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ *   rw_i = {   2,   4,   1,   0 }
+ *   s_i  = { 2/7, 4/7, 1/7,   0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wg we can compute the new shares distribution (s'_i) using:
+ *
+ *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ *   rw'_i = {   3,   4,   1,   0 }
+ *   s'_i  = { 3/8, 4/8, 1/8,   0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ *   dw_i = S * (s'_i - s_i)                                            (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
 */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
-        if (!tg->parent)
+        if (!tg->parent)        /* the trivial, non-cgroup case */
                return wl;
        for_each_sched_entity(se) {
-                long lw, w;
+                long w, W;
                tg = se->my_q->tg;
-                w = se->my_q->load.weight;
-                /* use this cpu's instantaneous contribution */
+                /*
-                lw = atomic_read(&tg->load_weight);
+                 * W = @wg + \Sum rw_j
-                lw -= se->my_q->load_contribution;
+                 */
-                lw += w + wg;
+                W = wg + calc_tg_weight(tg, se->my_q);
-                wl += w;
+                /*
+                 * w = rw_i + @wl
+                 */
+                w = se->my_q->load.weight + wl;
-                if (lw > 0 && wl < lw)
+                /*
-                        wl = (wl * tg->shares) / lw;
+                 * wl = S * s'_i; see (2)
+                 */
+                if (W > 0 && w < W)
+                        wl = (w * tg->shares) / W;
                else
                        wl = tg->shares;
-                /* zero point is MIN_SHARES */
+                /*
+                 * Per the above, wl is the new se->load.weight value; since
+                 * those are clipped to [MIN_SHARES, ...) do so now. See
+                 * calc_cfs_shares().
+                 */
                if (wl < MIN_SHARES)
                        wl = MIN_SHARES;
+                /*
+                 * wl = dw_i = S * (s'_i - s_i); see (3)
+                 */
                wl -= se->load.weight;
+                /*
+                 * Recursively apply this logic to all parent groups to compute
+                 * the final effective load change on the root group. Since
+                 * only the @tg group gets extra weight, all parent groups can
+                 * only redistribute existing shares. @wl is the shift in shares
+                 * resulting from this level per the above.
+                 */
                wg = 0;
        }
@@ -2249,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
        int cpu = smp_processor_id();
        int prev_cpu = task_cpu(p);
        struct sched_domain *sd;
-        int i;
+        struct sched_group *sg;
+        int i, smt = 0;
        /*
         * If the task is going to be woken-up on this cpu and if it is
@@ -2269,25 +2347,40 @@ static int select_idle_sibling(struct task_struct *p, int target)
         * Otherwise, iterate the domains and find an elegible idle cpu.
         */
        rcu_read_lock();
+again:
        for_each_domain(target, sd) {
+                if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
+                        continue;
+                if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
+                        break;
                if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
                        break;
-                for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
+                sg = sd->groups;
-                        if (idle_cpu(i)) {
+                do {
-                                target = i;
+                        if (!cpumask_intersects(sched_group_cpus(sg),
-                                break;
+                                                tsk_cpus_allowed(p)))
+                                goto next;
+                        for_each_cpu(i, sched_group_cpus(sg)) {
+                                if (!idle_cpu(i))
+                                        goto next;
                        }
-                }
-                /*
+                        target = cpumask_first_and(sched_group_cpus(sg),
-                 * Lets stop looking for an idle sibling when we reached
+                                        tsk_cpus_allowed(p));
-                 * the domain that spans the current cpu and prev_cpu.
+                        goto done;
-                 */
+next:
-                if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
+                        sg = sg->next;
-                    cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+                } while (sg != sd->groups);
-                        break;
+        }
+        if (!smt) {
+                smt = 1;
+                goto again;
        }
+done:
        rcu_read_unlock();
        return target;
@@ -3511,7 +3604,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 }
 /**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu