diff options
author | Paul Turner <pjt@google.com> | 2012-10-04 07:18:30 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2012-10-24 04:27:22 -0400 |
commit | 9ee474f55664ff63111c843099d365e7ecffb56f (patch) | |
tree | 745a678b0d3cd72ba42b67d0b6ac6c3872b14229 | |
parent | 2dac754e10a5d41d94d2d2365c0345d4f215a266 (diff) |
sched: Maintain the load contribution of blocked entities
We are currently maintaining:
runnable_load(cfs_rq) = \Sum task_load(t)
For all running children t of cfs_rq. While this can be naturally updated for
tasks in a runnable state (as they are scheduled); this does not account for
the load contributed by blocked task entities.
This can be solved by introducing a separate accounting for blocked load:
blocked_load(cfs_rq) = \Sum runnable(b) * weight(b)
Obviously we do not want to iterate over all blocked entities to account for
their decay, we instead observe that:
runnable_load(t) = \Sum p_i*y^i
and that to account for an additional idle period we only need to compute:
y*runnable_load(t).
This means that we can compute all blocked entities at once by evaluating:
blocked_load(cfs_rq)` = y * blocked_load(cfs_rq)
Finally we maintain a decay counter so that when a sleeping entity re-awakens
we can determine how much of its load should be removed from the blocked sum.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20120823141506.585389902@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/debug.c | 3 | ||||
-rw-r--r-- | kernel/sched/fair.c | 128 | ||||
-rw-r--r-- | kernel/sched/sched.h | 4 |
5 files changed, 122 insertions, 15 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 81d8b1ba4100..b1831accfd89 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1103,6 +1103,7 @@ struct sched_avg { | |||
1103 | */ | 1103 | */ |
1104 | u32 runnable_avg_sum, runnable_avg_period; | 1104 | u32 runnable_avg_sum, runnable_avg_period; |
1105 | u64 last_runnable_update; | 1105 | u64 last_runnable_update; |
1106 | s64 decay_count; | ||
1106 | unsigned long load_avg_contrib; | 1107 | unsigned long load_avg_contrib; |
1107 | }; | 1108 | }; |
1108 | 1109 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fd9d0859350a..00898f1fb69e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1528,7 +1528,6 @@ static void __sched_fork(struct task_struct *p) | |||
1528 | p->se.avg.runnable_avg_period = 0; | 1528 | p->se.avg.runnable_avg_period = 0; |
1529 | p->se.avg.runnable_avg_sum = 0; | 1529 | p->se.avg.runnable_avg_sum = 0; |
1530 | #endif | 1530 | #endif |
1531 | |||
1532 | #ifdef CONFIG_SCHEDSTATS | 1531 | #ifdef CONFIG_SCHEDSTATS |
1533 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 1532 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
1534 | #endif | 1533 | #endif |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c953a89f94aa..2d2e2b3c1bef 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -95,6 +95,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
95 | P(se->avg.runnable_avg_sum); | 95 | P(se->avg.runnable_avg_sum); |
96 | P(se->avg.runnable_avg_period); | 96 | P(se->avg.runnable_avg_period); |
97 | P(se->avg.load_avg_contrib); | 97 | P(se->avg.load_avg_contrib); |
98 | P(se->avg.decay_count); | ||
98 | #endif | 99 | #endif |
99 | #undef PN | 100 | #undef PN |
100 | #undef P | 101 | #undef P |
@@ -227,6 +228,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
227 | atomic_read(&cfs_rq->tg->load_weight)); | 228 | atomic_read(&cfs_rq->tg->load_weight)); |
228 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", | 229 | SEQ_printf(m, " .%-30s: %lld\n", "runnable_load_avg", |
229 | cfs_rq->runnable_load_avg); | 230 | cfs_rq->runnable_load_avg); |
231 | SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg", | ||
232 | cfs_rq->blocked_load_avg); | ||
230 | #endif | 233 | #endif |
231 | 234 | ||
232 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 235 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77af759e5675..83194175e841 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -259,6 +259,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
259 | return grp->my_q; | 259 | return grp->my_q; |
260 | } | 260 | } |
261 | 261 | ||
262 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq); | ||
263 | |||
262 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 264 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
263 | { | 265 | { |
264 | if (!cfs_rq->on_list) { | 266 | if (!cfs_rq->on_list) { |
@@ -278,6 +280,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | |||
278 | } | 280 | } |
279 | 281 | ||
280 | cfs_rq->on_list = 1; | 282 | cfs_rq->on_list = 1; |
283 | /* We should have no load, but we need to update last_decay. */ | ||
284 | update_cfs_rq_blocked_load(cfs_rq); | ||
281 | } | 285 | } |
282 | } | 286 | } |
283 | 287 | ||
@@ -1081,6 +1085,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now, | |||
1081 | return decayed; | 1085 | return decayed; |
1082 | } | 1086 | } |
1083 | 1087 | ||
1088 | /* Synchronize an entity's decay with its parenting cfs_rq.*/ | ||
1089 | static inline void __synchronize_entity_decay(struct sched_entity *se) | ||
1090 | { | ||
1091 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1092 | u64 decays = atomic64_read(&cfs_rq->decay_counter); | ||
1093 | |||
1094 | decays -= se->avg.decay_count; | ||
1095 | if (!decays) | ||
1096 | return; | ||
1097 | |||
1098 | se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); | ||
1099 | se->avg.decay_count = 0; | ||
1100 | } | ||
1101 | |||
1084 | /* Compute the current contribution to load_avg by se, return any delta */ | 1102 | /* Compute the current contribution to load_avg by se, return any delta */ |
1085 | static long __update_entity_load_avg_contrib(struct sched_entity *se) | 1103 | static long __update_entity_load_avg_contrib(struct sched_entity *se) |
1086 | { | 1104 | { |
@@ -1096,8 +1114,18 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) | |||
1096 | return se->avg.load_avg_contrib - old_contrib; | 1114 | return se->avg.load_avg_contrib - old_contrib; |
1097 | } | 1115 | } |
1098 | 1116 | ||
1117 | static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, | ||
1118 | long load_contrib) | ||
1119 | { | ||
1120 | if (likely(load_contrib < cfs_rq->blocked_load_avg)) | ||
1121 | cfs_rq->blocked_load_avg -= load_contrib; | ||
1122 | else | ||
1123 | cfs_rq->blocked_load_avg = 0; | ||
1124 | } | ||
1125 | |||
1099 | /* Update a sched_entity's runnable average */ | 1126 | /* Update a sched_entity's runnable average */ |
1100 | static inline void update_entity_load_avg(struct sched_entity *se) | 1127 | static inline void update_entity_load_avg(struct sched_entity *se, |
1128 | int update_cfs_rq) | ||
1101 | { | 1129 | { |
1102 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1130 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1103 | long contrib_delta; | 1131 | long contrib_delta; |
@@ -1107,8 +1135,34 @@ static inline void update_entity_load_avg(struct sched_entity *se) | |||
1107 | return; | 1135 | return; |
1108 | 1136 | ||
1109 | contrib_delta = __update_entity_load_avg_contrib(se); | 1137 | contrib_delta = __update_entity_load_avg_contrib(se); |
1138 | |||
1139 | if (!update_cfs_rq) | ||
1140 | return; | ||
1141 | |||
1110 | if (se->on_rq) | 1142 | if (se->on_rq) |
1111 | cfs_rq->runnable_load_avg += contrib_delta; | 1143 | cfs_rq->runnable_load_avg += contrib_delta; |
1144 | else | ||
1145 | subtract_blocked_load_contrib(cfs_rq, -contrib_delta); | ||
1146 | } | ||
1147 | |||
1148 | /* | ||
1149 | * Decay the load contributed by all blocked children and account this so that | ||
1150 | * their contribution may appropriately discounted when they wake up. | ||
1151 | */ | ||
1152 | static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) | ||
1153 | { | ||
1154 | u64 now = rq_of(cfs_rq)->clock_task >> 20; | ||
1155 | u64 decays; | ||
1156 | |||
1157 | decays = now - cfs_rq->last_decay; | ||
1158 | if (!decays) | ||
1159 | return; | ||
1160 | |||
1161 | cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, | ||
1162 | decays); | ||
1163 | atomic64_add(decays, &cfs_rq->decay_counter); | ||
1164 | |||
1165 | cfs_rq->last_decay = now; | ||
1112 | } | 1166 | } |
1113 | 1167 | ||
1114 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | 1168 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) |
@@ -1118,26 +1172,53 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable) | |||
1118 | 1172 | ||
1119 | /* Add the load generated by se into cfs_rq's child load-average */ | 1173 | /* Add the load generated by se into cfs_rq's child load-average */ |
1120 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 1174 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, |
1121 | struct sched_entity *se) | 1175 | struct sched_entity *se, |
1176 | int wakeup) | ||
1122 | { | 1177 | { |
1123 | update_entity_load_avg(se); | 1178 | /* we track migrations using entity decay_count == 0 */ |
1179 | if (unlikely(!se->avg.decay_count)) { | ||
1180 | se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task; | ||
1181 | wakeup = 0; | ||
1182 | } else { | ||
1183 | __synchronize_entity_decay(se); | ||
1184 | } | ||
1185 | |||
1186 | if (wakeup) | ||
1187 | subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); | ||
1188 | |||
1189 | update_entity_load_avg(se, 0); | ||
1124 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; | 1190 | cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; |
1191 | update_cfs_rq_blocked_load(cfs_rq); | ||
1125 | } | 1192 | } |
1126 | 1193 | ||
1127 | /* Remove se's load from this cfs_rq child load-average */ | 1194 | /* |
1195 | * Remove se's load from this cfs_rq child load-average, if the entity is | ||
1196 | * transitioning to a blocked state we track its projected decay using | ||
1197 | * blocked_load_avg. | ||
1198 | */ | ||
1128 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | 1199 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, |
1129 | struct sched_entity *se) | 1200 | struct sched_entity *se, |
1201 | int sleep) | ||
1130 | { | 1202 | { |
1131 | update_entity_load_avg(se); | 1203 | update_entity_load_avg(se, 1); |
1204 | |||
1132 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; | 1205 | cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; |
1206 | if (sleep) { | ||
1207 | cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; | ||
1208 | se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); | ||
1209 | } /* migrations, e.g. sleep=0 leave decay_count == 0 */ | ||
1133 | } | 1210 | } |
1134 | #else | 1211 | #else |
1135 | static inline void update_entity_load_avg(struct sched_entity *se) {} | 1212 | static inline void update_entity_load_avg(struct sched_entity *se, |
1213 | int update_cfs_rq) {} | ||
1136 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} | 1214 | static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} |
1137 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, | 1215 | static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, |
1138 | struct sched_entity *se) {} | 1216 | struct sched_entity *se, |
1217 | int wakeup) {} | ||
1139 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, | 1218 | static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, |
1140 | struct sched_entity *se) {} | 1219 | struct sched_entity *se, |
1220 | int sleep) {} | ||
1221 | static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq) {} | ||
1141 | #endif | 1222 | #endif |
1142 | 1223 | ||
1143 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1224 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -1266,7 +1347,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1266 | */ | 1347 | */ |
1267 | update_curr(cfs_rq); | 1348 | update_curr(cfs_rq); |
1268 | update_cfs_load(cfs_rq, 0); | 1349 | update_cfs_load(cfs_rq, 0); |
1269 | enqueue_entity_load_avg(cfs_rq, se); | 1350 | enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); |
1270 | account_entity_enqueue(cfs_rq, se); | 1351 | account_entity_enqueue(cfs_rq, se); |
1271 | update_cfs_shares(cfs_rq); | 1352 | update_cfs_shares(cfs_rq); |
1272 | 1353 | ||
@@ -1341,7 +1422,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1341 | * Update run-time statistics of the 'current'. | 1422 | * Update run-time statistics of the 'current'. |
1342 | */ | 1423 | */ |
1343 | update_curr(cfs_rq); | 1424 | update_curr(cfs_rq); |
1344 | dequeue_entity_load_avg(cfs_rq, se); | 1425 | dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); |
1345 | 1426 | ||
1346 | update_stats_dequeue(cfs_rq, se); | 1427 | update_stats_dequeue(cfs_rq, se); |
1347 | if (flags & DEQUEUE_SLEEP) { | 1428 | if (flags & DEQUEUE_SLEEP) { |
@@ -1512,7 +1593,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
1512 | /* Put 'current' back into the tree. */ | 1593 | /* Put 'current' back into the tree. */ |
1513 | __enqueue_entity(cfs_rq, prev); | 1594 | __enqueue_entity(cfs_rq, prev); |
1514 | /* in !on_rq case, update occurred at dequeue */ | 1595 | /* in !on_rq case, update occurred at dequeue */ |
1515 | update_entity_load_avg(prev); | 1596 | update_entity_load_avg(prev, 1); |
1516 | } | 1597 | } |
1517 | cfs_rq->curr = NULL; | 1598 | cfs_rq->curr = NULL; |
1518 | } | 1599 | } |
@@ -1528,7 +1609,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
1528 | /* | 1609 | /* |
1529 | * Ensure that runnable average is periodically updated. | 1610 | * Ensure that runnable average is periodically updated. |
1530 | */ | 1611 | */ |
1531 | update_entity_load_avg(curr); | 1612 | update_entity_load_avg(curr, 1); |
1613 | update_cfs_rq_blocked_load(cfs_rq); | ||
1532 | 1614 | ||
1533 | /* | 1615 | /* |
1534 | * Update share accounting for long-running entities. | 1616 | * Update share accounting for long-running entities. |
@@ -2387,6 +2469,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2387 | 2469 | ||
2388 | update_cfs_load(cfs_rq, 0); | 2470 | update_cfs_load(cfs_rq, 0); |
2389 | update_cfs_shares(cfs_rq); | 2471 | update_cfs_shares(cfs_rq); |
2472 | update_entity_load_avg(se, 1); | ||
2390 | } | 2473 | } |
2391 | 2474 | ||
2392 | if (!se) { | 2475 | if (!se) { |
@@ -2448,6 +2531,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
2448 | 2531 | ||
2449 | update_cfs_load(cfs_rq, 0); | 2532 | update_cfs_load(cfs_rq, 0); |
2450 | update_cfs_shares(cfs_rq); | 2533 | update_cfs_shares(cfs_rq); |
2534 | update_entity_load_avg(se, 1); | ||
2451 | } | 2535 | } |
2452 | 2536 | ||
2453 | if (!se) { | 2537 | if (!se) { |
@@ -3498,6 +3582,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) | |||
3498 | 3582 | ||
3499 | update_rq_clock(rq); | 3583 | update_rq_clock(rq); |
3500 | update_cfs_load(cfs_rq, 1); | 3584 | update_cfs_load(cfs_rq, 1); |
3585 | update_cfs_rq_blocked_load(cfs_rq); | ||
3501 | 3586 | ||
3502 | /* | 3587 | /* |
3503 | * We need to update shares after updating tg->load_weight in | 3588 | * We need to update shares after updating tg->load_weight in |
@@ -5232,6 +5317,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) | |||
5232 | place_entity(cfs_rq, se, 0); | 5317 | place_entity(cfs_rq, se, 0); |
5233 | se->vruntime -= cfs_rq->min_vruntime; | 5318 | se->vruntime -= cfs_rq->min_vruntime; |
5234 | } | 5319 | } |
5320 | |||
5321 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5322 | /* | ||
5323 | * Remove our load from contribution when we leave sched_fair | ||
5324 | * and ensure we don't carry in an old decay_count if we | ||
5325 | * switch back. | ||
5326 | */ | ||
5327 | if (p->se.avg.decay_count) { | ||
5328 | struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); | ||
5329 | __synchronize_entity_decay(&p->se); | ||
5330 | subtract_blocked_load_contrib(cfs_rq, | ||
5331 | p->se.avg.load_avg_contrib); | ||
5332 | } | ||
5333 | #endif | ||
5235 | } | 5334 | } |
5236 | 5335 | ||
5237 | /* | 5336 | /* |
@@ -5278,6 +5377,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
5278 | #ifndef CONFIG_64BIT | 5377 | #ifndef CONFIG_64BIT |
5279 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 5378 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
5280 | #endif | 5379 | #endif |
5380 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
5381 | atomic64_set(&cfs_rq->decay_counter, 1); | ||
5382 | #endif | ||
5281 | } | 5383 | } |
5282 | 5384 | ||
5283 | #ifdef CONFIG_FAIR_GROUP_SCHED | 5385 | #ifdef CONFIG_FAIR_GROUP_SCHED |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e6539736af58..664ff39195f7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -229,7 +229,9 @@ struct cfs_rq { | |||
229 | * This allows for the description of both thread and group usage (in | 229 | * This allows for the description of both thread and group usage (in |
230 | * the FAIR_GROUP_SCHED case). | 230 | * the FAIR_GROUP_SCHED case). |
231 | */ | 231 | */ |
232 | u64 runnable_load_avg; | 232 | u64 runnable_load_avg, blocked_load_avg; |
233 | atomic64_t decay_counter; | ||
234 | u64 last_decay; | ||
233 | #endif | 235 | #endif |
234 | #ifdef CONFIG_FAIR_GROUP_SCHED | 236 | #ifdef CONFIG_FAIR_GROUP_SCHED |
235 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 237 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |