author	Paul Turner <pjt@google.com>	2012-10-04 07:18:30 -0400
committer	Ingo Molnar <mingo@kernel.org>	2012-10-24 04:27:23 -0400
commit	aff3e498844441fa71c5ee1bbc470e1dff9548d9
tree	78085232ff0200ad8247d1948bbe6131b6f504ab /kernel/sched
parent	0a74bef8bed18dc6889e9bc37ea1050a50c86c89
sched: Account for blocked load waking back up
When a running entity blocks we migrate its tracked load to
cfs_rq->blocked_load_avg. In the sleep case this occurs while holding
rq->lock and so is a natural transition. Wake-ups, however, are
potentially asynchronous in the presence of migration and so special
care must be taken. We use an atomic counter to track such migrated
load, taking care to match this with the previously introduced decay
counters so that we don't migrate too much load.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.726077467@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
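The core of the change is a small producer/consumer handshake around the new
cfs_rq->removed_load counter: the CPU a blocked task leaves may only add the
task's contribution to the atomic counter, and the owning CPU folds it into
blocked_load_avg the next time it updates the cfs_rq under its own rq->lock.
The standalone C sketch below illustrates that pattern only; the toy_* names,
the C11 atomics and the simplified locking are inventions for this example,
with just blocked_load_avg and removed_load mirroring the real cfs_rq fields.

/*
 * Minimal userspace sketch (not kernel code) of the accumulate-then-fold
 * pattern this patch adds: remote CPUs only accumulate removed load; the
 * owner subtracts it from blocked_load_avg under its own lock.
 */
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct toy_cfs_rq {
	uint64_t blocked_load_avg;		/* owner-only, under the "rq lock" */
	atomic_uint_fast64_t removed_load;	/* may be updated remotely */
};

/* Remote side: a blocked task migrates away; the old rq lock is NOT held. */
static void toy_migrate_away(struct toy_cfs_rq *cfs_rq, uint64_t load_contrib)
{
	atomic_fetch_add(&cfs_rq->removed_load, load_contrib);
}

/* Owner side: runs with the rq lock held, e.g. from the periodic update. */
static void toy_update_blocked_load(struct toy_cfs_rq *cfs_rq)
{
	uint64_t removed = atomic_exchange(&cfs_rq->removed_load, 0);

	if (removed > cfs_rq->blocked_load_avg)	/* guard against underflow */
		removed = cfs_rq->blocked_load_avg;
	cfs_rq->blocked_load_avg -= removed;
}

int main(void)
{
	struct toy_cfs_rq rq = { .blocked_load_avg = 3000 };

	atomic_init(&rq.removed_load, 0);
	toy_migrate_away(&rq, 1024);	/* task's blocked contribution leaves */
	toy_update_blocked_load(&rq);	/* folded in on the owner's next update */
	printf("blocked_load_avg = %" PRIu64 "\n", rq.blocked_load_avg);
	return 0;
}

Built as an ordinary userspace program this prints blocked_load_avg = 1976:
the migrated 1024 units are only subtracted once the owner runs its update.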
Diffstat (limited to 'kernel/sched')
-rw-r--r--	kernel/sched/fair.c	| 100
-rw-r--r--	kernel/sched/sched.h	|   2
2 files changed, 81 insertions(+), 21 deletions(-)
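On the wake-up side (see the enqueue_entity_load_avg() hunk below), the
sleeper's idle time cannot be read from the remote CPU's clock_task, so it is
approximated from the decay periods carried back in a negative decay_count;
each period corresponds to 1 << 20 ns (about 1 ms), matching the >> 20 and
<< 20 shifts in the patch. A minimal sketch of just that arithmetic, with all
values invented for illustration:

/*
 * Illustrative sketch (not kernel code) of the sleep-time approximation
 * used on wake-up migration: a negative decay_count carries the number of
 * decay periods (1 << 20 ns each) that elapsed remotely, and rewinding
 * last_runnable_update by that amount lets the normal update path charge
 * roughly the right amount of sleep time.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t decay_count = -37;			/* 37 remote decay periods */
	uint64_t last_runnable_update = 5000000000ULL;	/* ns on the new CPU */
	uint64_t slept = (uint64_t)(-decay_count) << 20; /* 38797312 ns, ~38.8 ms */

	last_runnable_update -= slept;
	printf("approximated sleep: %" PRIu64 " ns, new stamp: %" PRIu64 " ns\n",
	       slept, last_runnable_update);
	return 0;
}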
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5e602e6ba0c3..74dc29ba1ad1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -259,7 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 	return grp->my_q;
 }
 
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq);
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+				       int force_update);
 
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
@@ -281,7 +282,7 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 
 		cfs_rq->on_list = 1;
 		/* We should have no load, but we need to update last_decay. */
-		update_cfs_rq_blocked_load(cfs_rq);
+		update_cfs_rq_blocked_load(cfs_rq, 0);
 	}
 }
 
@@ -1086,17 +1087,19 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
 }
 
 /* Synchronize an entity's decay with its parenting cfs_rq.*/
-static inline void __synchronize_entity_decay(struct sched_entity *se)
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
 	decays -= se->avg.decay_count;
 	if (!decays)
-		return;
+		return 0;
 
 	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
 	se->avg.decay_count = 0;
+
+	return decays;
 }
 
 /* Compute the current contribution to load_avg by se, return any delta */
@@ -1149,20 +1152,26 @@ static inline void update_entity_load_avg(struct sched_entity *se,
  * Decay the load contributed by all blocked children and account this so that
  * their contribution may appropriately discounted when they wake up.
  */
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq)
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
 {
 	u64 now = rq_of(cfs_rq)->clock_task >> 20;
 	u64 decays;
 
 	decays = now - cfs_rq->last_decay;
-	if (!decays)
+	if (!decays && !force_update)
 		return;
 
-	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
-					      decays);
-	atomic64_add(decays, &cfs_rq->decay_counter);
+	if (atomic64_read(&cfs_rq->removed_load)) {
+		u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+		subtract_blocked_load_contrib(cfs_rq, removed_load);
+	}
 
-	cfs_rq->last_decay = now;
+	if (decays) {
+		cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+						      decays);
+		atomic64_add(decays, &cfs_rq->decay_counter);
+		cfs_rq->last_decay = now;
+	}
 }
 
 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
@@ -1175,20 +1184,42 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 						  struct sched_entity *se,
 						  int wakeup)
 {
-	/* we track migrations using entity decay_count == 0 */
-	if (unlikely(!se->avg.decay_count)) {
+	/*
+	 * We track migrations using entity decay_count <= 0, on a wake-up
+	 * migration we use a negative decay count to track the remote decays
+	 * accumulated while sleeping.
+	 */
+	if (unlikely(se->avg.decay_count <= 0)) {
 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+		if (se->avg.decay_count) {
+			/*
+			 * In a wake-up migration we have to approximate the
+			 * time sleeping.  This is because we can't synchronize
+			 * clock_task between the two cpus, and it is not
+			 * guaranteed to be read-safe.  Instead, we can
+			 * approximate this using our carried decays, which are
+			 * explicitly atomically readable.
+			 */
+			se->avg.last_runnable_update -= (-se->avg.decay_count)
+							<< 20;
+			update_entity_load_avg(se, 0);
+			/* Indicate that we're now synchronized and on-rq */
+			se->avg.decay_count = 0;
+		}
 		wakeup = 0;
 	} else {
 		__synchronize_entity_decay(se);
 	}
 
-	if (wakeup)
+	/* migrated tasks did not contribute to our blocked load */
+	if (wakeup) {
 		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+		update_entity_load_avg(se, 0);
+	}
 
-	update_entity_load_avg(se, 0);
 	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
-	update_cfs_rq_blocked_load(cfs_rq);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
 }
 
 /*
@@ -1201,6 +1232,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   int sleep)
 {
 	update_entity_load_avg(se, 1);
+	/* we force update consideration on load-balancer moves */
+	update_cfs_rq_blocked_load(cfs_rq, !sleep);
 
 	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
 	if (sleep) {
@@ -1218,7 +1251,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 					   struct sched_entity *se,
 					   int sleep) {}
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+					      int force_update) {}
 #endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1610,7 +1644,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_entity_load_avg(curr, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * Update share accounting for long-running entities.
@@ -3057,6 +3091,19 @@ unlock:
 static void
 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 {
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Load tracking: accumulate removed load so that it can be processed
+	 * when we next update owning cfs_rq under rq->lock.  Tasks contribute
+	 * to blocked load iff they have a positive decay-count.  It can never
+	 * be negative here since on-rq tasks have decay-count == 0.
+	 */
+	if (se->avg.decay_count) {
+		se->avg.decay_count = -__synchronize_entity_decay(se);
+		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+	}
 }
 #endif /* CONFIG_SMP */
 
@@ -3593,7 +3640,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 
 	update_rq_clock(rq);
 	update_cfs_load(cfs_rq, 1);
-	update_cfs_rq_blocked_load(cfs_rq);
+	update_cfs_rq_blocked_load(cfs_rq, 1);
 
 	/*
 	 * We need to update shares after updating tg->load_weight in
@@ -5390,12 +5437,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #endif
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 	atomic64_set(&cfs_rq->decay_counter, 1);
+	atomic64_set(&cfs_rq->removed_load, 0);
 #endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct cfs_rq *cfs_rq;
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5427,8 +5476,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	if (!on_rq)
 		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
-	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+	if (!on_rq) {
+		cfs_rq = cfs_rq_of(&p->se);
+		p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+		/*
+		 * migrate_task_rq_fair() will have removed our previous
+		 * contribution, but we must synchronize for ongoing future
+		 * decay.
+		 */
+		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+	}
 }
 
 void free_fair_sched_group(struct task_group *tg)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 664ff39195f7..30236ab4edc0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -230,7 +230,7 @@ struct cfs_rq {
 	 * the FAIR_GROUP_SCHED case).
 	 */
 	u64 runnable_load_avg, blocked_load_avg;
-	atomic64_t decay_counter;
+	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
 #endif
 #ifdef CONFIG_FAIR_GROUP_SCHED