author		Paul Turner <pjt@google.com>	2011-07-21 12:43:31 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-08-14 06:03:28 -0400
commit		58088ad0152ba4b7997388c93d0ca208ec1ece75
tree		22d818b745056effc53ee6fa97ee9103548766b5 /kernel
parent		ec12cb7f31e28854efae7dd6f9544e0a66379040
sched: Add a timer to handle CFS bandwidth refresh
This patch adds a per-task_group timer which handles the refresh of the global
CFS bandwidth pool.
Since the RT pool uses a similar timer, there is some small refactoring to
share this support.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
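
For orientation before the diff: the new CFS period timer follows the standard self-rearming hrtimer pattern, in which the callback forwards the timer by whole periods, does the per-period refill, and keeps the timer running only while there is activity. The following is a minimal sketch of that pattern, not code from the patch; my_bandwidth, my_period_tick and my_refill_pool are hypothetical stand-ins, and only the hrtimer calls mirror the ones used in the hunks below.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/kernel.h>

struct my_bandwidth {
	ktime_t period;			/* refresh interval */
	struct hrtimer period_timer;	/* fires once per period */
};

/* hypothetical per-period work: refill the pool, return non-zero when idle */
static int my_refill_pool(struct my_bandwidth *b, int overrun)
{
	/* a real implementation would top up the group's quota here */
	return 0;
}

static enum hrtimer_restart my_period_tick(struct hrtimer *timer)
{
	struct my_bandwidth *b =
		container_of(timer, struct my_bandwidth, period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		/* advance expiry by whole periods; 0 means not due yet */
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, b->period);
		if (!overrun)
			break;

		idle = my_refill_pool(b, overrun);
	}

	/* keep firing while there is activity; stop once the group is idle */
	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static void my_bandwidth_init(struct my_bandwidth *b, ktime_t period)
{
	b->period = period;
	hrtimer_init(&b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	b->period_timer.function = my_period_tick;
}

Returning HRTIMER_NORESTART when a full period passes without consumption is what lets the patch below park the timer for idle groups; assign_cfs_rq_runtime() then restarts it via __start_cfs_bandwidth() as soon as bandwidth is consumed again.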
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c		107
-rw-r--r--	kernel/sched_fair.c	 40
2 files changed, 123 insertions(+), 24 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 35561c63a490..34bf8e6db9af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
 	return sysctl_sched_rt_runtime >= 0;
 }
 
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-	ktime_t now;
+	unsigned long delta;
+	ktime_t soft, hard, now;
 
+	for (;;) {
+		if (hrtimer_active(period_timer))
+			break;
+
+		now = hrtimer_cb_get_time(period_timer);
+		hrtimer_forward(period_timer, now, period);
+
+		soft = hrtimer_get_softexpires(period_timer);
+		hard = hrtimer_get_expires(period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(period_timer, soft, delta,
+					 HRTIMER_MODE_ABS_PINNED, 0);
+	}
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	for (;;) {
-		unsigned long delta;
-		ktime_t soft, hard;
-
-		if (hrtimer_active(&rt_b->rt_period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
-		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
-		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
-		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-					 HRTIMER_MODE_ABS_PINNED, 0);
-	}
+	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
@@ -253,6 +256,9 @@ struct cfs_bandwidth {
 	ktime_t period;
 	u64 quota, runtime;
 	s64 hierarchal_quota;
+
+	int idle, timer_active;
+	struct hrtimer period_timer;
 #endif
 };
 
@@ -403,6 +409,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 }
 
 static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
 
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
@@ -410,6 +438,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->runtime = 0;
 	cfs_b->quota = RUNTIME_INF;
 	cfs_b->period = ns_to_ktime(default_cfs_period());
+
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->period_timer.function = sched_cfs_period_timer;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -417,8 +448,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	cfs_rq->runtime_enabled = 0;
 }
 
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	/*
+	 * The timer may be active because we're trying to set a new bandwidth
+	 * period or because we're racing with the tear-down path
+	 * (timer_active==0 becomes visible before the hrtimer call-back
+	 * terminates).  In either case we ensure that it's re-programmed
+	 */
+	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* ensure cfs_b->lock is available while we wait */
+		hrtimer_cancel(&cfs_b->period_timer);
+
+		raw_spin_lock(&cfs_b->lock);
+		/* if someone else restarted the timer then we're done */
+		if (cfs_b->timer_active)
+			return;
+	}
+
+	cfs_b->timer_active = 1;
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{}
+{
+	hrtimer_cancel(&cfs_b->period_timer);
+}
 #else
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
@@ -9078,7 +9135,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
-	int i, ret = 0;
+	int i, ret = 0, runtime_enabled;
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
 
 	if (tg == &root_task_group)
@@ -9105,10 +9162,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (ret)
 		goto out_unlock;
 
+	runtime_enabled = quota != RUNTIME_INF;
 	raw_spin_lock_irq(&cfs_b->lock);
 	cfs_b->period = ns_to_ktime(period);
 	cfs_b->quota = quota;
 	cfs_b->runtime = quota;
+
+	/* restart the period timer (if active) to handle new period expiry */
+	if (runtime_enabled && cfs_b->timer_active) {
+		/* force a reprogram */
+		cfs_b->timer_active = 0;
+		__start_cfs_bandwidth(cfs_b);
+	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_possible_cpu(i) {
@@ -9116,7 +9181,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 		struct rq *rq = rq_of(cfs_rq);
 
 		raw_spin_lock_irq(&rq->lock);
-		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_enabled = runtime_enabled;
 		cfs_rq->runtime_remaining = 0;
 		raw_spin_unlock_irq(&rq->lock);
 	}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9502aa899f73..af73a8a85eef 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1284,9 +1284,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	raw_spin_lock(&cfs_b->lock);
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
-	else if (cfs_b->runtime > 0) {
-		amount = min(cfs_b->runtime, min_amount);
-		cfs_b->runtime -= amount;
+	else {
+		/* ensure bandwidth timer remains active under consumption */
+		if (!cfs_b->timer_active)
+			__start_cfs_bandwidth(cfs_b);
+
+		if (cfs_b->runtime > 0) {
+			amount = min(cfs_b->runtime, min_amount);
+			cfs_b->runtime -= amount;
+			cfs_b->idle = 0;
+		}
 	}
 	raw_spin_unlock(&cfs_b->lock);
 
@@ -1315,6 +1322,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 		__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+	int idle = 1;
+
+	raw_spin_lock(&cfs_b->lock);
+	/* no need to continue the timer with no bandwidth constraint */
+	if (cfs_b->quota == RUNTIME_INF)
+		goto out_unlock;
+
+	idle = cfs_b->idle;
+	cfs_b->runtime = cfs_b->quota;
+
+	/* mark as potentially idle for the upcoming period */
+	cfs_b->idle = 1;
+out_unlock:
+	if (idle)
+		cfs_b->timer_active = 0;
+	raw_spin_unlock(&cfs_b->lock);
+
+	return idle;
+}
 #else
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				   unsigned long delta_exec) {}