| author | Paul Turner <pjt@google.com> | 2012-10-04 07:18:31 -0400 |
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2012-10-24 04:27:26 -0400 |
| commit | bb17f65571e97a7ec0297571fb1154fbd107ad00 (patch) | |
| tree | d7574f5a57abe38112cbac832a29354ee1e4fa64 /kernel/sched | |
| parent | 8165e145ceb62fc338e099c9b12b3239c83d2f8e (diff) | |
sched: Normalize tg load contributions against runnable time
Entities of equal weight should receive equitable distribution of cpu time.
This is challenging in the case of a task_group's shares as execution may be
occurring on multiple cpus simultaneously.
To handle this we divide up the shares into weights proportionate to the load
on each cfs_rq. This does not, however, account for the fact that the sum of
the parts may be less than one cpu, so we need to normalize:
load(tg) = min(runnable_avg(tg), 1) * tg->shares
Where runnable_avg is the aggregate time in which the task_group had runnable
children.
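
To make the normalization concrete, here is a minimal userspace sketch of the formula (not kernel code; the function name, the 1024-based fixed point standing in for NICE_0_LOAD, and the sample values are all illustrative):

```c
#include <stdio.h>
#include <stdint.h>

#define UNIT 1024	/* 1.0 in fixed point, i.e. one fully-used cpu */

/* load(tg) = min(runnable_avg(tg), 1) * tg->shares */
static uint64_t tg_load(uint64_t shares, uint64_t runnable_avg)
{
	if (runnable_avg > UNIT)	/* clamp at one cpu's worth of runnable time */
		runnable_avg = UNIT;
	return shares * runnable_avg / UNIT;
}

int main(void)
{
	/* runnable ~25% of the time: contributes a quarter of its shares */
	printf("%llu\n", (unsigned long long)tg_load(1024, 256));	/* 256 */
	/* runnable on more than one cpu at once: clamped to full shares */
	printf("%llu\n", (unsigned long long)tg_load(1024, 1536));	/* 1024 */
	return 0;
}
```

Without the min() clamp, a group that keeps several cpus busy at once would contribute more load than its configured shares.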
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.930124292@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/debug.c |  4 |
-rw-r--r-- | kernel/sched/fair.c  | 56 |
-rw-r--r-- | kernel/sched/sched.h |  2 |
3 files changed, 62 insertions, 0 deletions
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 290892361a09..71b0ea325e93 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -234,6 +234,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, "  .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg_runnable_contrib",
+			cfs_rq->tg_runnable_contrib);
+	SEQ_printf(m, "  .%-30s: %d\n", "tg->runnable_avg",
+			atomic_read(&cfs_rq->tg->runnable_avg));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
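
For reference, the two new lines land in the per-cfs_rq section of the scheduler debug output (/proc/sched_debug), alongside the existing tg_load_contrib line; the values shown here are purely illustrative:

```
  .tg_load_contrib              : 742
  .tg_runnable_contrib          : 731
  .tg->runnable_avg             : 912
```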
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e20cb2693ef7..9e49722da032 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1118,19 +1118,73 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 	}
 }
 
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	long contrib;
+
+	/* The fraction of a cpu used by this cfs_rq */
+	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+			  sa->runnable_avg_period + 1);
+	contrib -= cfs_rq->tg_runnable_contrib;
+
+	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+		atomic_add(contrib, &tg->runnable_avg);
+		cfs_rq->tg_runnable_contrib += contrib;
+	}
+}
+
 static inline void __update_group_entity_contrib(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg = cfs_rq->tg;
+	int runnable_avg;
+
 	u64 contrib;
 
 	contrib = cfs_rq->tg_load_contrib * tg->shares;
 	se->avg.load_avg_contrib = div64_u64(contrib,
 					     atomic64_read(&tg->load_avg) + 1);
+
+	/*
+	 * For group entities we need to compute a correction term in the case
+	 * that they are consuming <1 cpu so that we would contribute the same
+	 * load as a task of equal weight.
+	 *
+	 * Explicitly co-ordinating this measurement would be expensive, but
+	 * fortunately the sum of each cpu's contribution forms a usable
+	 * lower-bound on the true value.
+	 *
+	 * Consider the aggregate of 2 contributions.  Either they are disjoint
+	 * (and the sum represents the true value) or they overlap and we are
+	 * understating by the aggregate of their overlap.
+	 *
+	 * Extending this to N cpus, for a given overlap, the maximum amount we
+	 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
+	 * cpus that overlap for this interval and w_i is the interval width.
+	 *
+	 * On a small machine the first term is well-bounded, which bounds the
+	 * total error since w_i is a subset of the period.  Whereas on a
+	 * larger machine, while this first term can be larger, if w_i is of
+	 * consequential size then n_i*w_i is guaranteed to quickly converge to
+	 * our upper bound of 1-cpu.
+	 */
+	runnable_avg = atomic_read(&tg->runnable_avg);
+	if (runnable_avg < NICE_0_LOAD) {
+		se->avg.load_avg_contrib *= runnable_avg;
+		se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+	}
 }
 #else
 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
 						 int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+						struct cfs_rq *cfs_rq) {}
 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
 #endif
 
@@ -1152,6 +1206,7 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
 	if (entity_is_task(se)) {
 		__update_task_entity_contrib(se);
 	} else {
+		__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
 		__update_group_entity_contrib(se);
 	}
 
@@ -1220,6 +1275,7 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
 	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
 }
 
 /* Add the load generated by se into cfs_rq's child load-average */
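
As a rough model of the arithmetic the fair.c changes above introduce, the userspace sketch below mirrors the fraction-of-a-cpu calculation, the 1/64 update hysteresis, and the <1-cpu correction; the struct, helper, and input numbers are hypothetical stand-ins, not the kernel's data structures:

```c
#include <stdio.h>
#include <stdlib.h>

#define NICE_0_LOAD	1024
#define NICE_0_SHIFT	10

/* Hypothetical per-cpu view of one of the group's cfs_rqs */
struct cfs_rq_model {
	unsigned int runnable_sum;	/* decayed time spent runnable */
	unsigned int runnable_period;	/* decayed total elapsed time  */
	int tg_runnable_contrib;	/* fraction last published     */
};

/*
 * Publish this cfs_rq's fraction of a cpu into the group-wide total,
 * but only once it has drifted by more than ~1/64 of the published
 * value, mirroring the hysteresis in __update_tg_runnable_avg().
 */
static void update_tg_runnable(struct cfs_rq_model *c, int *tg_runnable_avg)
{
	long contrib = ((long)c->runnable_sum << NICE_0_SHIFT) /
		       (c->runnable_period + 1);

	contrib -= c->tg_runnable_contrib;
	if (labs(contrib) > c->tg_runnable_contrib / 64) {
		*tg_runnable_avg += contrib;
		c->tg_runnable_contrib += contrib;
	}
}

int main(void)
{
	int tg_runnable_avg = 0;
	struct cfs_rq_model a = { 250, 1000, 0 };	/* ~25% of a cpu */
	struct cfs_rq_model b = { 500, 1000, 0 };	/* ~50% of a cpu */
	long load_avg_contrib = 1024;			/* uncorrected value */

	update_tg_runnable(&a, &tg_runnable_avg);
	update_tg_runnable(&b, &tg_runnable_avg);

	/*
	 * Correction term from __update_group_entity_contrib(): the group
	 * is using less than one cpu in aggregate, so scale its load
	 * contribution down by that fraction.
	 */
	if (tg_runnable_avg < NICE_0_LOAD) {
		load_avg_contrib *= tg_runnable_avg;
		load_avg_contrib >>= NICE_0_SHIFT;
	}

	printf("runnable_avg=%d load_avg_contrib=%ld\n",
	       tg_runnable_avg, load_avg_contrib);	/* 766 and 766 */
	return 0;
}
```

The 1/64 threshold means a cfs_rq only touches the shared per-task_group atomic once its published fraction has drifted by more than about 1.5%, which keeps cross-cpu traffic on tg->runnable_avg bounded.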
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 924a99094888..134928dc6f05 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -113,6 +113,7 @@ struct task_group {
 
 	atomic_t load_weight;
 	atomic64_t load_avg;
+	atomic_t runnable_avg;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -234,6 +235,7 @@ struct cfs_rq {
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif
 #endif