author		Paul Turner <pjt@google.com>	2011-07-21 12:43:41 -0400
committer	Ingo Molnar <mingo@elte.hu>	2011-08-14 06:03:54 -0400
commit		d8b4986d3dbc4fabc2054d63f1d31d6ed2fb1ca8 (patch)
tree		d6afd92e5425f64b337c916d12dc58ca101c334d /kernel
parent		e8da1b18b32064c43881bceef0f051c2110c9ab9 (diff)
sched: Return unused runtime on group dequeue
When a local cfs_rq blocks we return the majority of its remaining quota to
the global bandwidth pool for use by other runqueues.  We do this only when
the quota is current and there is more than min_cfs_rq_runtime [1ms by
default] of runtime remaining on the rq.

In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second timer is kicked off to handle this
delivery, unthrottling where appropriate.

Using a 'worst case' antagonist which executes on each cpu for 1ms before
moving onto the next, on a fairly large machine:

no quota generations:
 197.47 ms	/cgroup/a/cpuacct.usage
 199.46 ms	/cgroup/a/cpuacct.usage
 205.46 ms	/cgroup/a/cpuacct.usage
 198.46 ms	/cgroup/a/cpuacct.usage
 208.39 ms	/cgroup/a/cpuacct.usage
Since we are allowed to use "stale" quota our usage is effectively bounded by
the rate of input into the global pool and performance is relatively stable.

with quota generations [1s increments]:
 119.58 ms	/cgroup/a/cpuacct.usage
 119.65 ms	/cgroup/a/cpuacct.usage
 119.64 ms	/cgroup/a/cpuacct.usage
 119.63 ms	/cgroup/a/cpuacct.usage
 119.60 ms	/cgroup/a/cpuacct.usage
The large deficit here is because quota generations (/intentionally/) prevent
us from using previously stranded slack quota; the cost is that this quota
becomes unavailable.

with quota generations and quota return:
 200.09 ms	/cgroup/a/cpuacct.usage
 200.09 ms	/cgroup/a/cpuacct.usage
 198.09 ms	/cgroup/a/cpuacct.usage
 200.09 ms	/cgroup/a/cpuacct.usage
 200.06 ms	/cgroup/a/cpuacct.usage
By returning unused quota we are able to both stably consume our desired
quota and prevent unintentional overages due to the abuse of slack quota from
previous quota periods (especially on a large machine).

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
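As a rough, illustrative sketch of how such a measurement cgroup might be set
up (not part of this patch; it assumes a cgroup-v1 hierarchy mounted at
/cgroup with the cpu and cpuacct controllers co-mounted, and the 100ms period
/ 200ms quota values are placeholders rather than the configuration actually
used for the numbers above):

/* Illustrative only: set a CFS bandwidth limit on /cgroup/a and sample the
 * group's cumulative cpu usage from cpuacct.usage (nanoseconds).
 * Assumes a cgroup-v1 mount at /cgroup; error handling kept minimal. */
#include <stdio.h>

static int write_val(const char *path, long long val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%lld\n", val);
        fclose(f);
        return 0;
}

int main(void)
{
        unsigned long long usage_ns = 0;
        FILE *f;

        /* placeholder values: 100ms period, 200ms quota (~2 cpus of bandwidth) */
        write_val("/cgroup/a/cpu.cfs_period_us", 100000);
        write_val("/cgroup/a/cpu.cfs_quota_us", 200000);

        /* read the group's cumulative cpu time in nanoseconds */
        f = fopen("/cgroup/a/cpuacct.usage", "r");
        if (f) {
                if (fscanf(f, "%llu", &usage_ns) == 1)
                        printf("%.2f ms\t/cgroup/a/cpuacct.usage\n", usage_ns / 1e6);
                fclose(f);
        }
        return 0;
}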
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c		15
-rw-r--r--	kernel/sched_fair.c	108
2 files changed, 122 insertions(+), 1 deletion(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 35c91859f8a..6baade0d764 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -259,7 +259,7 @@ struct cfs_bandwidth {
         u64 runtime_expires;
 
         int idle, timer_active;
-        struct hrtimer period_timer;
+        struct hrtimer period_timer, slack_timer;
         struct list_head throttled_cfs_rq;
 
         /* statistics */
@@ -421,6 +421,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 
 static inline u64 default_cfs_period(void);
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+        struct cfs_bandwidth *cfs_b =
+                container_of(timer, struct cfs_bandwidth, slack_timer);
+        do_sched_cfs_slack_timer(cfs_b);
+
+        return HRTIMER_NORESTART;
+}
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
@@ -453,6 +463,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         cfs_b->period_timer.function = sched_cfs_period_timer;
+        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+        cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -488,6 +500,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
         hrtimer_cancel(&cfs_b->period_timer);
+        hrtimer_cancel(&cfs_b->slack_timer);
 }
 #else
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d201f28c1de..1ca2cd44d64 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1052,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
         __clear_buddies_skip(se);
 }
 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -1090,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         if (!(flags & DEQUEUE_SLEEP))
                 se->vruntime -= cfs_rq->min_vruntime;
 
+        /* return excess runtime on last dequeue */
+        return_cfs_rq_runtime(cfs_rq);
+
         update_min_vruntime(cfs_rq);
         update_cfs_shares(cfs_rq);
 }
@@ -1674,6 +1679,108 @@ out_unlock:
         return idle;
 }
 
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+        struct hrtimer *refresh_timer = &cfs_b->period_timer;
+        u64 remaining;
+
+        /* if the call-back is running a quota refresh is already occurring */
+        if (hrtimer_callback_running(refresh_timer))
+                return 1;
+
+        /* is a quota refresh about to occur? */
+        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+        if (remaining < min_expire)
+                return 1;
+
+        return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+        u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+        /* if there's a quota refresh soon don't bother with slack */
+        if (runtime_refresh_within(cfs_b, min_left))
+                return;
+
+        start_bandwidth_timer(&cfs_b->slack_timer,
+                        ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+        s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+        if (slack_runtime <= 0)
+                return;
+
+        raw_spin_lock(&cfs_b->lock);
+        if (cfs_b->quota != RUNTIME_INF &&
+            cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+                cfs_b->runtime += slack_runtime;
+
+                /* we are under rq->lock, defer unthrottling using a timer */
+                if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+                    !list_empty(&cfs_b->throttled_cfs_rq))
+                        start_cfs_slack_bandwidth(cfs_b);
+        }
+        raw_spin_unlock(&cfs_b->lock);
+
+        /* even if it's not valid for return we don't want to try again */
+        cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+        if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+                return;
+
+        __return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+        u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+        u64 expires;
+
+        /* confirm we're still not at a refresh boundary */
+        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+                return;
+
+        raw_spin_lock(&cfs_b->lock);
+        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+                runtime = cfs_b->runtime;
+                cfs_b->runtime = 0;
+        }
+        expires = cfs_b->runtime_expires;
+        raw_spin_unlock(&cfs_b->lock);
+
+        if (!runtime)
+                return;
+
+        runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+        raw_spin_lock(&cfs_b->lock);
+        if (expires == cfs_b->runtime_expires)
+                cfs_b->runtime = runtime;
+        raw_spin_unlock(&cfs_b->lock);
+}
+
 /*
  * When a group wakes up we want to make sure that its quota is not already
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1715,6 +1822,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
                                      unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
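For illustration, the slack-return arithmetic introduced above can also be
sketched in isolation (a standalone userspace approximation, not kernel code;
the 1ms floor mirrors min_cfs_rq_runtime from this patch, and the
quota_is_current flag stands in for the runtime_expires generation check):

/* Illustrative only: how much runtime a dequeuing cfs_rq would hand back.
 * The cfs_rq keeps min_cfs_rq_runtime (1ms) for itself and donates the rest,
 * but only if its locally cached quota belongs to the current global period. */
#include <stdio.h>

#define NSEC_PER_MSEC 1000000LL

static const long long min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;

static long long slack_to_return(long long runtime_remaining, int quota_is_current)
{
        long long slack = runtime_remaining - min_cfs_rq_runtime;

        if (slack <= 0 || !quota_is_current)
                return 0;
        return slack;
}

int main(void)
{
        /* 4ms left at dequeue: 3ms goes back to the global pool */
        printf("%lld ns returned\n", slack_to_return(4 * NSEC_PER_MSEC, 1));
        /* 0.5ms left: below the 1ms floor, nothing is returned */
        printf("%lld ns returned\n", slack_to_return(NSEC_PER_MSEC / 2, 1));
        /* quota from a previous period: nothing is returned */
        printf("%lld ns returned\n", slack_to_return(4 * NSEC_PER_MSEC, 0));
        return 0;
}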