aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched_fair.c
diff options
context:
space:
mode:
authorPaul Turner <pjt@google.com>2011-07-21 12:43:41 -0400
committerIngo Molnar <mingo@elte.hu>2011-08-14 06:03:54 -0400
commitd8b4986d3dbc4fabc2054d63f1d31d6ed2fb1ca8 (patch)
treed6afd92e5425f64b337c916d12dc58ca101c334d /kernel/sched_fair.c
parente8da1b18b32064c43881bceef0f051c2110c9ab9 (diff)
sched: Return unused runtime on group dequeue
When a local cfs_rq blocks we return the majority of its remaining quota to the global bandwidth pool for use by other runqueues. We do this only when the quota is current and there is more than min_cfs_rq_quota [1ms by default] of runtime remaining on the rq. In the case where there are throttled runqueues and we have sufficient bandwidth to meter out a slice, a second timer is kicked off to handle this delivery, unthrottling where appropriate. Using a 'worst case' antagonist which executes on each cpu for 1ms before moving onto the next on a fairly large machine: no quota generations: 197.47 ms /cgroup/a/cpuacct.usage 199.46 ms /cgroup/a/cpuacct.usage 205.46 ms /cgroup/a/cpuacct.usage 198.46 ms /cgroup/a/cpuacct.usage 208.39 ms /cgroup/a/cpuacct.usage Since we are allowed to use "stale" quota our usage is effectively bounded by the rate of input into the global pool and performance is relatively stable. with quota generations [1s increments]: 119.58 ms /cgroup/a/cpuacct.usage 119.65 ms /cgroup/a/cpuacct.usage 119.64 ms /cgroup/a/cpuacct.usage 119.63 ms /cgroup/a/cpuacct.usage 119.60 ms /cgroup/a/cpuacct.usage The large deficit here is due to quota generations (/intentionally/) preventing us from now using previously stranded slack quota. The cost is that this quota becomes unavailable. with quota generations and quota return: 200.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 198.09 ms /cgroup/a/cpuacct.usage 200.09 ms /cgroup/a/cpuacct.usage 200.06 ms /cgroup/a/cpuacct.usage By returning unused quota we're able to both stably consume our desired quota and prevent unintentional overages due to the abuse of slack quota from previous quota periods (especially on a large machine). Signed-off-by: Paul Turner <pjt@google.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--kernel/sched_fair.c108
1 files changed, 108 insertions, 0 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d201f28c1de7..1ca2cd44d64a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1052,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1052 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1053} 1053}
1054 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1055static void 1057static void
1056dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1057{ 1059{
@@ -1090,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1090 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1091 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1092 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1093 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1094 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1095} 1100}
@@ -1674,6 +1679,108 @@ out_unlock:
1674 return idle; 1679 return idle;
1675} 1680}
1676 1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695 /* if the call-back is running a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
1677/* 1784/*
1678 * When a group wakes up we want to make sure that its quota is not already 1785 * When a group wakes up we want to make sure that its quota is not already
1679 * expired/exceeded, otherwise it may be allowed to steal additional ticks of 1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1715,6 +1822,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1715 unsigned long delta_exec) {} 1822 unsigned long delta_exec) {}
1716static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1717static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1718 1826
1719static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1720{ 1828{