 kernel/sched.c      |  15
 kernel/sched_fair.c | 108
 2 files changed, 122 insertions(+), 1 deletion(-)
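The patch below wires a second hrtimer (a "slack" timer) into struct cfs_bandwidth and, when a cfs_rq dequeues its last task, returns any locally cached runtime above a 1ms floor to the global pool; the timer then redistributes that slack to throttled cfs_rqs unless a quota refresh is imminent. As a rough model of the thresholds involved, here is a minimal user-space C sketch; it is not kernel code, and the helper names and sample values are assumptions for illustration only.

/*
 * Minimal user-space model of the slack-return decision made below in
 * __return_cfs_rq_runtime()/start_cfs_slack_bandwidth().  Not kernel code;
 * the helper names and sample values are assumptions for illustration.
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_PER_MSEC 1000000ULL

static const uint64_t min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
static const uint64_t min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
static const uint64_t cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;

/* runtime a cfs_rq would hand back to the global pool on its last dequeue */
static int64_t slack_to_return(int64_t runtime_remaining)
{
	int64_t slack = runtime_remaining - (int64_t)min_cfs_rq_runtime;

	return slack > 0 ? slack : 0;
}

/* mirrors runtime_refresh_within(): don't arm the slack timer near a refresh */
static int should_arm_slack_timer(uint64_t period_remaining_ns)
{
	return period_remaining_ns >=
	       cfs_bandwidth_slack_period + min_bandwidth_expiration;
}

int main(void)
{
	int64_t remaining = 4 * NSEC_PER_MSEC;		/* local runtime left */
	uint64_t period_left = 20 * NSEC_PER_MSEC;	/* time to next refresh */

	printf("returned to pool: %" PRId64 " ns\n", slack_to_return(remaining));
	printf("arm slack timer:  %s\n",
	       should_arm_slack_timer(period_left) ? "yes" : "no");
	return 0;
}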
diff --git a/kernel/sched.c b/kernel/sched.c
index 35c91859f8a6..6baade0d7649 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -259,7 +259,7 @@ struct cfs_bandwidth {
 	u64 runtime_expires;
 
 	int idle, timer_active;
-	struct hrtimer period_timer;
+	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
 	/* statistics */
@@ -421,6 +421,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
 
 static inline u64 default_cfs_period(void);
 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
 
 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
@@ -453,6 +463,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -488,6 +500,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
 	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
 }
 #else
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d201f28c1de7..1ca2cd44d64a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1052,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		__clear_buddies_skip(se);
 }
 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -1090,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
+	/* return excess runtime on last dequeue */
+	return_cfs_rq_runtime(cfs_rq);
+
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
 }
@@ -1674,6 +1679,108 @@ out_unlock:
 	return idle;
 }
 
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	u64 remaining;
+
+	/* if the call-back is running a quota refresh is already occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+	/* if there's a quota refresh soon don't bother with slack */
+	if (runtime_refresh_within(cfs_b, min_left))
+		return;
+
+	start_bandwidth_timer(&cfs_b->slack_timer,
+			ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+	if (slack_runtime <= 0)
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF &&
+	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+		cfs_b->runtime += slack_runtime;
+
+		/* we are under rq->lock, defer unthrottling using a timer */
+		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+		    !list_empty(&cfs_b->throttled_cfs_rq))
+			start_cfs_slack_bandwidth(cfs_b);
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	/* even if it's not valid for return we don't want to try again */
+	cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+		return;
+
+	__return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	u64 expires;
+
+	/* confirm we're still not at a refresh boundary */
+	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	if (expires == cfs_b->runtime_expires)
+		cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
 /*
  * When a group wakes up we want to make sure that its quota is not already
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1715,6 +1822,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
 				unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
 {
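For completeness, none of the slack-return machinery above triggers unless a group actually has a bandwidth limit configured. A minimal way to exercise it is to give a cgroup a quota smaller than its period through the cpu controller; the sketch below is illustrative only, and the cgroup-v1 mount point and group name are assumptions about the local setup.

/* Hypothetical helper: write one value into a cgroup-v1 cpu controller file. */
#include <stdio.h>

static int write_cgroup_val(const char *path, long long val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%lld\n", val);
	return fclose(f);
}

int main(void)
{
	/* 100ms period, 50ms quota: the group may use at most half a CPU. */
	write_cgroup_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_period_us", 100000);
	write_cgroup_val("/sys/fs/cgroup/cpu/demo/cpu.cfs_quota_us", 50000);
	return 0;
}

When tasks in such a group block before consuming their locally cached slice, the dequeue path added by this patch hands anything above min_cfs_rq_runtime (1ms) back to the global pool, so sibling cfs_rqs are less likely to sit throttled until the next period refresh.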