author:    Paul Turner <pjt@google.com>      2012-10-04 07:18:30 -0400
committer: Ingo Molnar <mingo@kernel.org>    2012-10-24 04:27:22 -0400
commit:    9ee474f55664ff63111c843099d365e7ecffb56f (patch)
tree:      745a678b0d3cd72ba42b67d0b6ac6c3872b14229 /kernel/sched/fair.c
parent:    2dac754e10a5d41d94d2d2365c0345d4f215a266 (diff)
sched: Maintain the load contribution of blocked entities
We are currently maintaining:
runnable_load(cfs_rq) = \Sum task_load(t)
For all running children t of cfs_rq. While this can be naturally updated for
tasks in a runnable state (as they are scheduled), it does not account for
the load contributed by blocked task entities.
This can be solved by introducing a separate accounting for blocked load:
blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)
Obviously we do not want to iterate over all blocked entities to account for
their decay; instead we observe that:
runnable_load(t) = \Sum p_i*y^i
and that to account for an additional idle period we only need to compute:
y*runnable_load(t).
This means that we can compute all blocked entities at once by evaluating:
blocked_load(cfs_rq)' = y * blocked_load(cfs_rq)
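To spell out why a single multiplication suffices (a sketch of the identity, writing p_{b,i} for the i-periods-old contribution of a blocked entity b, following the per-entity series above): each blocked contribution is itself a geometric series in y, so aging everything by one idle period multiplies every term by y and the common factor pulls out of the sum:

  \[
  \mathrm{blocked\_load}'(\mathrm{cfs\_rq})
    = \sum_{b} y \sum_{i} p_{b,i}\, y^{i}
    = y \sum_{b} \sum_{i} p_{b,i}\, y^{i}
    = y \cdot \mathrm{blocked\_load}(\mathrm{cfs\_rq})
  \]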
Finally we maintain a decay counter so that when a sleeping entity re-awakens
we can determine how much of its load should be removed from the blocked sum.
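As a rough model of the bookkeeping the hunks below add, here is a minimal userspace sketch. The names (cfs_rq_model, se_model, update_blocked_load, synchronize_decay) and the floating-point DECAY_Y constant are illustrative assumptions, not the kernel's fixed-point implementation (which uses a y chosen so that y^32 = 1/2 over ~1ms periods); update_blocked_load() stands in for update_cfs_rq_blocked_load(), and synchronize_decay() for __synchronize_entity_decay() plus subtract_blocked_load_contrib().

```c
/*
 * Illustrative userspace model only: decay a shared "blocked load" sum once
 * per idle period, and catch a woken entity up using a per-rq decay counter.
 */
#include <stdio.h>
#include <stdint.h>

#define DECAY_Y 0.978   /* stand-in decay factor; the kernel uses fixed point */

struct cfs_rq_model {
        double blocked_load_avg;        /* decayed load of blocked children */
        uint64_t decay_counter;         /* total periods decayed so far */
        uint64_t last_decay;            /* period stamp of the last decay pass */
};

struct se_model {
        double load_avg_contrib;        /* this entity's load contribution */
        uint64_t decay_count;           /* decay_counter value when it blocked */
};

static double decay_load(double load, uint64_t periods)
{
        while (periods--)
                load *= DECAY_Y;
        return load;
}

/* One decay_load() call per elapsed period ages the whole blocked sum. */
static void update_blocked_load(struct cfs_rq_model *cfs_rq, uint64_t now)
{
        uint64_t decays = now - cfs_rq->last_decay;

        if (!decays)
                return;
        cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, decays);
        cfs_rq->decay_counter += decays;
        cfs_rq->last_decay = now;
}

/* On wakeup, decay the entity by the periods it missed and pull it out. */
static void synchronize_decay(struct cfs_rq_model *cfs_rq, struct se_model *se)
{
        uint64_t decays = cfs_rq->decay_counter - se->decay_count;

        se->load_avg_contrib = decay_load(se->load_avg_contrib, decays);
        se->decay_count = 0;
        cfs_rq->blocked_load_avg -= se->load_avg_contrib;
        if (cfs_rq->blocked_load_avg < 0)
                cfs_rq->blocked_load_avg = 0;
}

int main(void)
{
        struct cfs_rq_model cfs_rq = { 0 };
        struct se_model se = { .load_avg_contrib = 1000.0 };

        /* entity blocks: add it to the blocked sum, remember the counter */
        cfs_rq.blocked_load_avg += se.load_avg_contrib;
        se.decay_count = cfs_rq.decay_counter;

        update_blocked_load(&cfs_rq, 50);       /* 50 idle periods pass */
        synchronize_decay(&cfs_rq, &se);        /* entity wakes back up */

        printf("woken contrib %.1f, remaining blocked sum %.1f\n",
               se.load_avg_contrib, cfs_rq.blocked_load_avg);
        return 0;
}
```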
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c  128
1 file changed, 115 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77af759e5675..83194175e841 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
         return grp->my_q;
 }
 
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq);
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
         if (!cfs_rq->on_list) {
@@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
                 }
 
                 cfs_rq->on_list = 1;
+                /* We should have no load, but we need to update last_decay. */
+                update_cfs_rq_blocked_load(cfs_rq);
         }
 }
 
@@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
         return decayed;
 }
 
+/* Synchronize an entity's decay with its parenting cfs_rq.*/
+static inline void __synchronize_entity_decay(struct sched_entity *se)
+{
+        struct cfs_rq *cfs_rq = cfs_rq_of(se);
+        u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+        decays -= se->avg.decay_count;
+        if (!decays)
+                return;
+
+        se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+        se->avg.decay_count = 0;
+}
+
 /* Compute the current contribution to load_avg by se, return any delta */
 static long __update_entity_load_avg_contrib(struct sched_entity *se)
 {
@@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
         return se->avg.load_avg_contrib - old_contrib;
 }
 
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+                                                 long load_contrib)
+{
+        if (likely(load_contrib < cfs_rq->blocked_load_avg))
+                cfs_rq->blocked_load_avg -= load_contrib;
+        else
+                cfs_rq->blocked_load_avg = 0;
+}
+
 /* Update a sched_entity's runnable average */
-static inline void update_entity_load_avg(struct sched_entity *se)
+static inline void update_entity_load_avg(struct sched_entity *se,
+                                          int update_cfs_rq)
 {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         long contrib_delta;
@@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se)
                 return;
 
         contrib_delta = __update_entity_load_avg_contrib(se);
+
+        if (!update_cfs_rq)
+                return;
+
         if (se->on_rq)
                 cfs_rq->runnable_load_avg += contrib_delta;
+        else
+                subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+{
+        u64 now = rq_of(cfs_rq)->clock_task >> 20;
+        u64 decays;
+
+        decays = now - cfs_rq->last_decay;
+        if (!decays)
+                return;
+
+        cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+                                              decays);
+        atomic64_add(decays, &cfs_rq->decay_counter);
+
+        cfs_rq->last_decay = now;
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 
 /* Add the load generated by se into cfs_rq's child load-average */
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-                                           struct sched_entity *se)
+                                           struct sched_entity *se,
+                                           int wakeup)
 {
-        update_entity_load_avg(se);
+        /* we track migrations using entity decay_count == 0 */
+        if (unlikely(!se->avg.decay_count)) {
+                se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+                wakeup = 0;
+        } else {
+                __synchronize_entity_decay(se);
+        }
+
+        if (wakeup)
+                subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+
+        update_entity_load_avg(se, 0);
         cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+        update_cfs_rq_blocked_load(cfs_rq);
 }
 
-/* Remove se's load from this cfs_rq child load-average */
+/*
+ * Remove se's load from this cfs_rq child load-average, if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-                                           struct sched_entity *se)
+                                           struct sched_entity *se,
+                                           int sleep)
 {
-        update_entity_load_avg(se);
+        update_entity_load_avg(se, 1);
+
         cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+        if (sleep) {
+                cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+                se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+        } /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
 #else
-static inline void update_entity_load_avg(struct sched_entity *se) {}
+static inline void update_entity_load_avg(struct sched_entity *se,
+                                          int update_cfs_rq) {}
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
-                                           struct sched_entity *se) {}
+                                           struct sched_entity *se,
+                                           int wakeup) {}
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
-                                           struct sched_entity *se) {}
+                                           struct sched_entity *se,
+                                           int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          */
         update_curr(cfs_rq);
         update_cfs_load(cfs_rq, 0);
-        enqueue_entity_load_avg(cfs_rq, se);
+        enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
         account_entity_enqueue(cfs_rq, se);
         update_cfs_shares(cfs_rq);
 
@@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * Update run-time statistics of the 'current'.
          */
         update_curr(cfs_rq);
-        dequeue_entity_load_avg(cfs_rq, se);
+        dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
         update_stats_dequeue(cfs_rq, se);
         if (flags & DEQUEUE_SLEEP) {
@@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
                 /* Put 'current' back into the tree. */
                 __enqueue_entity(cfs_rq, prev);
                 /* in !on_rq case, update occurred at dequeue */
-                update_entity_load_avg(prev);
+                update_entity_load_avg(prev, 1);
         }
         cfs_rq->curr = NULL;
 }
@@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         /*
          * Ensure that runnable average is periodically updated.
          */
-        update_entity_load_avg(curr);
+        update_entity_load_avg(curr, 1);
+        update_cfs_rq_blocked_load(cfs_rq);
 
         /*
          * Update share accounting for long-running entities.
@@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
                 update_cfs_load(cfs_rq, 0);
                 update_cfs_shares(cfs_rq);
+                update_entity_load_avg(se, 1);
         }
 
         if (!se) {
@@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
                 update_cfs_load(cfs_rq, 0);
                 update_cfs_shares(cfs_rq);
+                update_entity_load_avg(se, 1);
         }
 
         if (!se) {
@@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
         update_rq_clock(rq);
         update_cfs_load(cfs_rq, 1);
+        update_cfs_rq_blocked_load(cfs_rq);
 
         /*
          * We need to update shares after updating tg->load_weight in
@@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
                 place_entity(cfs_rq, se, 0);
                 se->vruntime -= cfs_rq->min_vruntime;
         }
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+        /*
+         * Remove our load from contribution when we leave sched_fair
+         * and ensure we don't carry in an old decay_count if we
+         * switch back.
+         */
+        if (p->se.avg.decay_count) {
+                struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+                __synchronize_entity_decay(&p->se);
+                subtract_blocked_load_contrib(cfs_rq,
+                                              p->se.avg.load_avg_contrib);
+        }
+#endif
 }
 
 /*
@@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+        atomic64_set(&cfs_rq->decay_counter, 1);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED