author     Ben Segall <bsegall@google.com>       2014-06-20 18:21:20 -0400
committer  Ingo Molnar <mingo@kernel.org>        2014-07-05 05:17:29 -0400
commit     c06f04c70489b9deea3212af8375e2f0c2f0b184
tree       4f2191c49ce53726392f8a4aa983c44c4e13d96e /kernel
parent     541b82644d72c1ef4a0587515a619712c1c19bd3
sched: Fix potential near-infinite distribute_cfs_runtime() loop
distribute_cfs_runtime() intentionally only hands out enough runtime to
bring each cfs_rq to 1 ns of runtime, expecting the cfs_rqs to then take
the runtime they need only once they actually get to run. However, if
they get to run sufficiently quickly, the period timer is still in
distribute_cfs_runtime() and no runtime is available, causing them to
throttle. Then distribute has to handle them again, and this can go on
until distribute has handed out all of the runtime 1ns at a time, which
takes far too long.

Instead allow access to the same runtime that distribute is handing out,
accepting that corner cases with very low quota may be able to spend the
entire cfs_b->runtime during distribute_cfs_runtime, meaning that the
runtime directly handed out by distribute_cfs_runtime was over quota. In
addition, if a cfs_rq does manage to throttle like this, make sure the
existing distribute_cfs_runtime no longer loops over it again.

Signed-off-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140620222120.13814.21652.stgit@sword-of-the-dawn.mtv.corp.google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
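[Editorial note] To make the failure mode concrete, here is a rough standalone toy model in plain userspace C. It is not kernel code: the quota and slice values and the two loops are illustrative assumptions, not the scheduler's real data structures. It only shows why handing out 1 ns per pass, while woken groups find the global pool empty and re-throttle, takes on the order of "quota in nanoseconds" passes, whereas leaving the runtime in cfs_b->runtime lets a woken group pull a full slice for itself.

/* Toy model only: the constants below are made up for illustration. */
#include <stdio.h>

int main(void)
{
        const long long quota_ns = 10000000; /* 10 ms of quota in the pool */
        const long long slice_ns = 5000000;  /* assumed per-group refill slice (5 ms) */
        long long pool, passes;

        /* Old behaviour: the timer empties cfs_b->runtime into a local pool,
         * so a group that wakes up mid-distribution finds nothing, throttles
         * again, and is handed another 1 ns on the next pass. */
        pool = quota_ns;
        passes = 0;
        while (pool > 0) {
                pool -= 1;      /* distribute tops the group up to 1 ns */
                passes++;       /* group runs, re-throttles, timer loops again */
        }
        printf("old scheme: %lld passes to drain the pool 1 ns at a time\n", passes);

        /* New behaviour: runtime stays in cfs_b->runtime, so a group that
         * wakes up during distribution refills itself with a full slice. */
        pool = quota_ns;
        passes = 0;
        while (pool > 0) {
                pool -= slice_ns;
                passes++;
        }
        printf("new scheme: %lld full-slice refills\n", passes);
        return 0;
}

Built with any C compiler, the first count scales with the quota in nanoseconds while the second stays small, which is the "far too long" behaviour the patch removes.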
Diffstat (limited to 'kernel')
 kernel/sched/fair.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1f9c4571615d..ef5eac773c70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3361,7 +3361,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         cfs_rq->throttled = 1;
         cfs_rq->throttled_clock = rq_clock(rq);
         raw_spin_lock(&cfs_b->lock);
-        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+        /*
+         * Add to the _head_ of the list, so that an already-started
+         * distribute_cfs_runtime will not see us
+         */
+        list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
         if (!cfs_b->timer_active)
                 __start_cfs_bandwidth(cfs_b, false);
         raw_spin_unlock(&cfs_b->lock);
@@ -3418,7 +3422,8 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                 u64 remaining, u64 expires)
 {
         struct cfs_rq *cfs_rq;
-        u64 runtime = remaining;
+        u64 runtime;
+        u64 starting_runtime = remaining;
 
         rcu_read_lock();
         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3449,7 +3454,7 @@ next:
         }
         rcu_read_unlock();
 
-        return remaining;
+        return starting_runtime - remaining;
 }
 
 /*
@@ -3495,22 +3500,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
 
-        /*
-         * There are throttled entities so we must first use the new bandwidth
-         * to unthrottle them before making it generally available. This
-         * ensures that all existing debts will be paid before a new cfs_rq is
-         * allowed to run.
-         */
-        runtime = cfs_b->runtime;
         runtime_expires = cfs_b->runtime_expires;
-        cfs_b->runtime = 0;
 
         /*
-         * This check is repeated as we are holding onto the new bandwidth
-         * while we unthrottle. This can potentially race with an unthrottled
-         * group trying to acquire new bandwidth from the global pool.
+         * This check is repeated as we are holding onto the new bandwidth while
+         * we unthrottle. This can potentially race with an unthrottled group
+         * trying to acquire new bandwidth from the global pool. This can result
+         * in us over-using our runtime if it is all used during this loop, but
+         * only by limited amounts in that extreme case.
          */
-        while (throttled && runtime > 0) {
+        while (throttled && cfs_b->runtime > 0) {
+                runtime = cfs_b->runtime;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3518,10 +3518,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
                 raw_spin_lock(&cfs_b->lock);
 
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+                cfs_b->runtime -= min(runtime, cfs_b->runtime);
         }
 
-        /* return (any) remaining runtime */
-        cfs_b->runtime = runtime;
         /*
          * While we are ensured activity in the period following an
          * unthrottle, this also covers the case in which the new bandwidth is
@@ -3632,10 +3632,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 return;
         }
 
-        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
-                cfs_b->runtime = 0;
-        }
+
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -3646,7 +3645,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
-                cfs_b->runtime = runtime;
+                cfs_b->runtime -= min(runtime, cfs_b->runtime);
         raw_spin_unlock(&cfs_b->lock);
 }
 
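[Editorial note] A side note on the new "cfs_b->runtime -= min(runtime, cfs_b->runtime);" lines in the hunks above: cfs_b->lock is dropped while distributing, so running groups can pull from the same pool concurrently, and the amount distribute_cfs_runtime() reports as used may exceed what is still left; the clamp presumably keeps the unsigned counter from wrapping below zero. A minimal userspace sketch of that arithmetic follows (toy values and a local min_u64() helper, not kernel code):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

static u64 min_u64(u64 a, u64 b)
{
        return a < b ? a : b;
}

int main(void)
{
        u64 pool = 3000;        /* runtime left after concurrent consumers */
        u64 used = 5000;        /* what distribute handed out while unlocked */

        u64 naive = pool - used;                  /* u64 underflow: wraps to ~1.8e19 */
        u64 clamped = pool - min_u64(used, pool); /* 0: the pool is simply empty */

        printf("naive:   %llu\n", (unsigned long long)naive);
        printf("clamped: %llu\n", (unsigned long long)clamped);
        return 0;
}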