| author | Paul Turner <pjt@google.com> | 2012-10-04 07:18:31 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2012-10-24 04:27:26 -0400 |
| commit | bb17f65571e97a7ec0297571fb1154fbd107ad00 (patch) | |
| tree | d7574f5a57abe38112cbac832a29354ee1e4fa64 /kernel/sched | |
| parent | 8165e145ceb62fc338e099c9b12b3239c83d2f8e (diff) | |
sched: Normalize tg load contributions against runnable time
Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.
To handle this we divide up the shares into weights proportionate to the load
on each cfs_rq. This does not, however, account for the fact that the sum of
the parts may be less than one cpu, so we need to normalize:
load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.
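
To make the normalization concrete, here is a minimal userspace sketch of the formula (not kernel code; the function name, the 1024-based fixed point standing in for NICE_0_LOAD, and the sample values are all illustrative):

```c
#include <stdio.h>
#include <stdint.h>

#define UNIT 1024	/* 1.0 in fixed point, i.e. one fully-used cpu */

/* load(tg) = min(runnable_avg(tg), 1) * tg->shares */
static uint64_t tg_load(uint64_t shares, uint64_t runnable_avg)
{
	if (runnable_avg > UNIT)	/* clamp at one cpu's worth of runnable time */
		runnable_avg = UNIT;
	return shares * runnable_avg / UNIT;
}

int main(void)
{
	/* runnable ~25% of the time: contributes a quarter of its shares */
	printf("%llu\n", (unsigned long long)tg_load(1024, 256));	/* 256 */
	/* runnable on more than one cpu at once: clamped to full shares */
	printf("%llu\n", (unsigned long long)tg_load(1024, 1536));	/* 1024 */
	return 0;
}
```

Without the min() clamp, a group that keeps several cpus busy at once would contribute more load than its configured shares.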
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/debug.c |  4 |
-rw-r--r-- | kernel/sched/fair.c  | 56 |
-rw-r--r-- | kernel/sched/sched.h |  2 |
3 files changed, 62 insertions, 0 deletions
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 290892361a09..71b0ea325e93 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
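
For reference, the two new lines land in the per-cfs_rq section of the scheduler debug output (/proc/sched_debug), alongside the existing tg_load_contrib line; the values shown here are purely illustrative:

```
  .tg_load_contrib              : 742
  .tg_runnable_contrib          : 731
  .tg->runnable_avg             : 912
```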
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e20cb2693ef7..9e49722da032 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	/* The fraction of a cpu used by this cfs_rq */
+	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * For group entities we need to compute a correction term in the case
+	 * that they are consuming <1 cpu so that we would contribute the same
+	 * load as a task of equal weight.
+	 *
+	 * Explicitly co-ordinating this measurement would be expensive, but
+	 * fortunately the sum of each cpu's contribution forms a usable
+	 * lower-bound on the true value.
+	 *
+	 * Consider the aggregate of 2 contributions.  Either they are disjoint
+	 * (and the sum represents the true value) or they overlap and we are
+	 * understating by the aggregate of their overlap.
+	 *
+	 * Extending this to N cpus, for a given overlap, the maximum amount we
+	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
+	 * cpus that overlap for this interval and w_i is the interval width.
+	 *
+	 * On a small machine the first term is well-bounded, which bounds the
+	 * total error since w_i is a subset of the period.  Whereas on a
+	 * larger machine, while this first term can be larger, if w_i is of
+	 * consequential size then n_i*w_i is guaranteed to quickly converge to
+	 * our upper bound of 1-cpu.
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < NICE_0_LOAD) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
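
As a rough model of the arithmetic the fair.c changes above introduce, the userspace sketch below mirrors the fraction-of-a-cpu calculation, the 1/64 update hysteresis, and the <1-cpu correction; the struct, helper, and input numbers are hypothetical stand-ins, not the kernel's data structures:

```c
#include <stdio.h>
#include <stdlib.h>

#define NICE_0_LOAD	1024
#define NICE_0_SHIFT	10

/* Hypothetical per-cpu view of one of the group's cfs_rqs */
struct cfs_rq_model {
	unsigned int runnable_sum;	/* decayed time spent runnable */
	unsigned int runnable_period;	/* decayed total elapsed time  */
	int tg_runnable_contrib;	/* fraction last published     */
};

/*
 * Publish this cfs_rq's fraction of a cpu into the group-wide total,
 * but only once it has drifted by more than ~1/64 of the published
 * value, mirroring the hysteresis in __update_tg_runnable_avg().
 */
static void update_tg_runnable(struct cfs_rq_model *c, int *tg_runnable_avg)
{
	long contrib = ((long)c->runnable_sum << NICE_0_SHIFT) /
		       (c->runnable_period + 1);

	contrib -= c->tg_runnable_contrib;
	if (labs(contrib) > c->tg_runnable_contrib / 64) {
		*tg_runnable_avg += contrib;
		c->tg_runnable_contrib += contrib;
	}
}

int main(void)
{
	int tg_runnable_avg = 0;
	struct cfs_rq_model a = { 250, 1000, 0 };	/* ~25% of a cpu */
	struct cfs_rq_model b = { 500, 1000, 0 };	/* ~50% of a cpu */
	long load_avg_contrib = 1024;			/* uncorrected value */

	update_tg_runnable(&a, &tg_runnable_avg);
	update_tg_runnable(&b, &tg_runnable_avg);

	/*
	 * Correction term from __update_group_entity_contrib(): the group
	 * is using less than one cpu in aggregate, so scale its load
	 * contribution down by that fraction.
	 */
	if (tg_runnable_avg < NICE_0_LOAD) {
		load_avg_contrib *= tg_runnable_avg;
		load_avg_contrib >>= NICE_0_SHIFT;
	}

	printf("runnable_avg=%d load_avg_contrib=%ld\n",
	       tg_runnable_avg, load_avg_contrib);	/* 766 and 766 */
	return 0;
}
```

The 1/64 threshold means a cfs_rq only touches the shared per-task_group atomic once its published fraction has drifted by more than about 1.5%, which keeps cross-cpu traffic on tg->runnable_avg bounded.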
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 924a99094888..134928dc6f05 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif