author		Paul Turner <pjt@google.com>	2011-07-21 12:43:32 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-08-14 06:03:31 -0400
commita9cf55b2861057a213e610da2fec52125439a11d (patch)
tree6c0caf35a6e8fbba7325227f11029f5f4d4cbf7e /kernel
parent58088ad0152ba4b7997388c93d0ca208ec1ece75 (diff)
sched: Expire invalid runtime
Since quota is managed using a global state but consumed on a per-cpu basis we need to ensure that our per-cpu state is appropriately synchronized. Most importantly, runtime that is stale (from a previous period) should not be locally consumable.

We take advantage of existing sched_clock synchronization about the jiffy to efficiently detect whether we have (globally) crossed a quota boundary.

One catch is that the direction of spread on sched_clock is undefined; specifically, we don't know whether our local clock is behind or ahead of the one responsible for the current expiration time.

Fortunately we can differentiate these cases by considering whether the global deadline has advanced. If it has not, then we assume our clock to be "fast" and advance our local expiration; otherwise, we know the deadline has truly passed and we expire our local runtime.

Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
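To make the deadline check above concrete, here is a minimal standalone C sketch (not kernel code) of the decision implemented by expire_cfs_rq_runtime() in this patch: signed differences of u64 clock values keep the comparison wraparound-safe, and locally cached runtime is only invalidated once the global deadline has actually advanced past the local one. The struct name local_state, the helper expire_local_runtime(), and the 1ms TICK_NSEC value are illustrative assumptions for the example, not kernel definitions.

/* Standalone sketch of the expiry decision; names and values are illustrative. */
#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assumed 1ms tick for the example */

struct local_state {
	uint64_t runtime_expires;	/* per-cpu deadline (local clock domain) */
	int64_t  runtime_remaining;	/* locally cached runtime */
};

/* Expire local runtime only if the global deadline has advanced past ours. */
static void expire_local_runtime(struct local_state *ls, uint64_t now,
				 uint64_t global_expires)
{
	/* signed difference handles wraparound: deadline still ahead, nothing to do */
	if ((int64_t)(now - ls->runtime_expires) < 0)
		return;

	if ((int64_t)(ls->runtime_expires - global_expires) >= 0) {
		/* global deadline has not advanced: our clock is merely "fast" */
		ls->runtime_expires += TICK_NSEC;
	} else {
		/* global deadline moved on: the period truly ended */
		ls->runtime_remaining = 0;
	}
}

int main(void)
{
	struct local_state ls = { .runtime_expires = 1000, .runtime_remaining = 500 };

	/* local clock ahead of its own deadline, but global deadline unchanged */
	expire_local_runtime(&ls, 1500, 1000);
	printf("fast clock: remaining=%lld expires=%llu\n",
	       (long long)ls.runtime_remaining,
	       (unsigned long long)ls.runtime_expires);

	/* global deadline has advanced past ours: the cached runtime is stale */
	expire_local_runtime(&ls, 2500000, 3000000);
	printf("expired:    remaining=%lld expires=%llu\n",
	       (long long)ls.runtime_remaining,
	       (unsigned long long)ls.runtime_expires);
	return 0;
}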
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	4
-rw-r--r--	kernel/sched_fair.c	90
2 files changed, 84 insertions, 10 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 34bf8e6db9af..a2d55144bd9c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -256,6 +256,7 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchal_quota;
+	u64 runtime_expires;
 
 	int idle, timer_active;
 	struct hrtimer period_timer;
@@ -396,6 +397,7 @@ struct cfs_rq {
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
 	int runtime_enabled;
+	u64 runtime_expires;
 	s64 runtime_remaining;
 #endif
 #endif
@@ -9166,8 +9168,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
-	cfs_b->runtime = quota;
 
+	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index af73a8a85eef..9d1adbd0b615 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1272,11 +1272,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
 }
 
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+	u64 now;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return;
+
+	now = sched_clock_cpu(smp_processor_id());
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
 static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg = cfs_rq->tg;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
-	u64 amount = 0, min_amount;
+	u64 amount = 0, min_amount, expires;
 
 	/* note: this is a positive sum as runtime_remaining <= 0 */
 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -1285,9 +1304,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/* ensure bandwidth timer remains active under consumption */
-		if (!cfs_b->timer_active)
+		/*
+		 * If the bandwidth pool has become inactive, then at least one
+		 * period must have elapsed since the last consumption.
+		 * Refresh the global state and ensure bandwidth timer becomes
+		 * active.
+		 */
+		if (!cfs_b->timer_active) {
+			__refill_cfs_bandwidth_runtime(cfs_b);
 			__start_cfs_bandwidth(cfs_b);
+		}
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -1295,19 +1321,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 			cfs_b->idle = 0;
 		}
 	}
+	expires = cfs_b->runtime_expires;
 	raw_spin_unlock(&cfs_b->lock);
 
 	cfs_rq->runtime_remaining += amount;
+	/*
+	 * we may have advanced our local expiration to account for allowed
+	 * spread between our sched_clock and the one on which runtime was
+	 * issued.
+	 */
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+		cfs_rq->runtime_expires = expires;
 }
 
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
-				      unsigned long delta_exec)
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
-	if (!cfs_rq->runtime_enabled)
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
 		return;
 
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can determine whether this is the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				      unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
 	cfs_rq->runtime_remaining -= delta_exec;
-	if (cfs_rq->runtime_remaining > 0)
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
 		return;
 
 	assign_cfs_rq_runtime(cfs_rq);
@@ -1338,7 +1406,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 		goto out_unlock;
 
 	idle = cfs_b->idle;
-	cfs_b->runtime = cfs_b->quota;
+	/* if we're going inactive then everything else can be deferred */
+	if (idle)
+		goto out_unlock;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+
 
 	/* mark as potentially idle for the upcoming period */
 	cfs_b->idle = 1;
@@ -1557,7 +1630,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 	return wl;
 }
-
 #else
 
 static inline unsigned long effective_load(struct task_group *tg, int cpu,