author		Paul Turner <pjt@google.com>	2011-07-21 12:43:32 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-08-14 06:03:31 -0400
commita9cf55b2861057a213e610da2fec52125439a11d (patch)
tree6c0caf35a6e8fbba7325227f11029f5f4d4cbf7e /kernel
parent58088ad0152ba4b7997388c93d0ca208ec1ece75 (diff)
sched: Expire invalid runtime
Since quota is managed using a global state but consumed on a per-cpu basis we need to ensure that our per-cpu state is appropriately synchronized. Most importantly, runtime that is stale (from a previous period) should not be locally consumable.

We take advantage of existing sched_clock synchronization about the jiffy to efficiently detect whether we have (globally) crossed a quota boundary.

One catch is that the direction of spread on sched_clock is undefined; specifically, we don't know whether our local clock is behind or ahead of the one responsible for the current expiration time.

Fortunately we can differentiate these cases by considering whether the global deadline has advanced. If it has not, then we assume our clock to be "fast" and advance our local expiration; otherwise, we know the deadline has truly passed and we expire our local runtime.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
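To make the deadline check above concrete, here is a minimal standalone C sketch (not kernel code) of the decision implemented by expire_cfs_rq_runtime() in this patch: signed differences of u64 clock values keep the comparison wraparound-safe, and locally cached runtime is only invalidated once the global deadline has actually advanced past the local one. The struct name local_state, the helper expire_local_runtime(), and the 1ms TICK_NSEC value are illustrative assumptions for the example, not kernel definitions.

/* Standalone sketch of the expiry decision; names and values are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assumed 1ms tick for the example */

struct local_state {
	uint64_t runtime_expires;	/* per-cpu deadline (local clock domain) */
	int64_t  runtime_remaining;	/* locally cached runtime */
};

/* Expire local runtime only if the global deadline has advanced past ours. */
static void expire_local_runtime(struct local_state *ls, uint64_t now,
				 uint64_t global_expires)
{
	/* signed difference handles wraparound: deadline still ahead, nothing to do */
	if ((int64_t)(now - ls->runtime_expires) < 0)
		return;

	if ((int64_t)(ls->runtime_expires - global_expires) >= 0) {
		/* global deadline has not advanced: our clock is merely "fast" */
		ls->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline moved on: the period truly ended */
		ls->runtime_remaining = 0;
	}
}

int main(void)
{
	struct local_state ls = { .runtime_expires = 1000, .runtime_remaining = 500 };

	/* local clock ahead of its own deadline, but global deadline unchanged */
	expire_local_runtime(&ls, 1500, 1000);
	printf("fast clock: remaining=%lld expires=%llu\n",
	       (long long)ls.runtime_remaining,
	       (unsigned long long)ls.runtime_expires);

	/* global deadline has advanced past ours: the cached runtime is stale */
	expire_local_runtime(&ls, 2500000, 3000000);
	printf("expired:    remaining=%lld expires=%llu\n",
	       (long long)ls.runtime_remaining,
	       (unsigned long long)ls.runtime_expires);
	return 0;
}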
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	4
-rw-r--r--	kernel/sched_fair.c	90
2 files changed, 84 insertions, 10 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bf8e6db9af..a2d55144bd9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -256,6 +256,7 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchal_quota;
+	u64 runtime_expires;
 
 	int idle, timer_active;
 	struct hrtimer period_timer;
@@ -396,6 +397,7 @@ struct cfs_rq {
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
+	u64 runtime_expires;
 	s64 runtime_remaining;
 #endif
 #endif
@@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
-	cfs_b->runtime = quota;
 
+	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index af73a8a85eef..9d1adbd0b615 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1272,11 +1272,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 }
 
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+	u64 now;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return;
+
+	now = sched_clock_cpu(smp_processor_id());
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
 static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-	u64 amount = 0, min_amount;
+	u64 amount = 0, min_amount, expires;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -1285,9 +1304,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/* ensure bandwidth timer remains active under consumption */
-		if (!cfs_b->timer_active)
+		/*
+		 * If the bandwidth pool has become inactive, then at least one
+		 * period must have elapsed since the last consumption.
+		 * Refresh the global state and ensure bandwidth timer becomes
+		 * active.
+		 */
+		if (!cfs_b->timer_active) {
+			__refill_cfs_bandwidth_runtime(cfs_b);
 			__start_cfs_bandwidth(cfs_b);
+		}
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -1295,19 +1321,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
+	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
 	cfs_rq->runtime_remaining += amount;
+	/*
+	 * we may have advanced our local expiration to account for allowed
+	 * spread between our sched_clock and the one on which runtime was
+	 * issued.
+	 */
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+		cfs_rq->runtime_expires = expires;
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				      unsigned long delta_exec)
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled)
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
 		return;
 
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can determine whether this is the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				      unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
-	if (cfs_rq->runtime_remaining > 0)
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
 	assign_cfs_rq_runtime(cfs_rq);
@@ -1338,7 +1406,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		goto out_unlock;
 
 	idle = cfs_b->idle;
-	cfs_b->runtime = cfs_b->quota;
+	/* if we're going inactive then everything else can be deferred */
+	if (idle)
+		goto out_unlock;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+
 
 	/* mark as potentially idle for the upcoming period */
 	cfs_b->idle = 1;
@@ -1557,7 +1630,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 	return wl;
 }
-
 #else
 
 static inline unsigned long effective_load(struct task_group *tg, int cpu,