author     Ben Segall <bsegall@google.com>       2014-06-20 18:21:20 -0400
committer  Ingo Molnar <mingo@kernel.org>        2014-07-05 05:17:29 -0400
commit     c06f04c70489b9deea3212af8375e2f0c2f0b184
tree       4f2191c49ce53726392f8a4aa983c44c4e13d96e /kernel
parent     541b82644d72c1ef4a0587515a619712c1c19bd3
sched: Fix potential near-infinite distribute_cfs_runtime() loop
distribute_cfs_runtime() intentionally only hands out enough runtime to
bring each cfs_rq to 1 ns of runtime, expecting the cfs_rqs to then take
the runtime they need only once they actually get to run. However, if
they get to run sufficiently quickly, the period timer is still in
distribute_cfs_runtime() and no runtime is available, causing them to
throttle. Then distribute has to handle them again, and this can go on
until distribute has handed out all of the runtime 1ns at a time, which
takes far too long.

Instead allow access to the same runtime that distribute is handing out,
accepting that corner cases with very low quota may be able to spend the
entire cfs_b->runtime during distribute_cfs_runtime, meaning that the
runtime directly handed out by distribute_cfs_runtime was over quota. In
addition, if a cfs_rq does manage to throttle like this, make sure the
existing distribute_cfs_runtime no longer loops over it again.

Signed-off-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140620222120.13814.21652.stgit@sword-of-the-dawn.mtv.corp.google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
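[Editorial note] To make the failure mode concrete, here is a rough standalone toy model in plain userspace C. It is not kernel code: the quota and slice values and the two loops are illustrative assumptions, not the scheduler's real data structures. It only shows why handing out 1 ns per pass, while woken groups find the global pool empty and re-throttle, takes on the order of "quota in nanoseconds" passes, whereas leaving the runtime in cfs_b->runtime lets a woken group pull a full slice for itself.

/* Toy model only: the constants below are made up for illustration. */
#include <stdio.h>

int main(void)
{
        const long long quota_ns = 10000000; /* 10 ms of quota in the pool */
        const long long slice_ns = 5000000;  /* assumed per-group refill slice (5 ms) */
        long long pool, passes;

        /* Old behaviour: the timer empties cfs_b->runtime into a local pool,
         * so a group that wakes up mid-distribution finds nothing, throttles
         * again, and is handed another 1 ns on the next pass. */
        pool = quota_ns;
        passes = 0;
        while (pool > 0) {
                pool -= 1;      /* distribute tops the group up to 1 ns */
                passes++;       /* group runs, re-throttles, timer loops again */
        }
        printf("old scheme: %lld passes to drain the pool 1 ns at a time\n", passes);

        /* New behaviour: runtime stays in cfs_b->runtime, so a group that
         * wakes up during distribution refills itself with a full slice. */
        pool = quota_ns;
        passes = 0;
        while (pool > 0) {
                pool -= slice_ns;
                passes++;
        }
        printf("new scheme: %lld full-slice refills\n", passes);
        return 0;
}

Built with any C compiler, the first count scales with the quota in nanoseconds while the second stays small, which is the "far too long" behaviour the patch removes.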
Diffstat (limited to 'kernel')
 kernel/sched/fair.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1f9c4571615d..ef5eac773c70 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3361,7 +3361,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
         cfs_rq->throttled = 1;
         cfs_rq->throttled_clock = rq_clock(rq);
         raw_spin_lock(&cfs_b->lock);
-        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+        /*
+         * Add to the _head_ of the list, so that an already-started
+         * distribute_cfs_runtime will not see us
+         */
+        list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
         if (!cfs_b->timer_active)
                 __start_cfs_bandwidth(cfs_b, false);
         raw_spin_unlock(&cfs_b->lock);
@@ -3418,7 +3422,8 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
                 u64 remaining, u64 expires)
 {
         struct cfs_rq *cfs_rq;
-        u64 runtime = remaining;
+        u64 runtime;
+        u64 starting_runtime = remaining;
 
         rcu_read_lock();
         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3449,7 +3454,7 @@ next:
         }
         rcu_read_unlock();
 
-        return remaining;
+        return starting_runtime - remaining;
 }
 
 /*
@@ -3495,22 +3500,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         /* account preceding periods in which throttling occurred */
         cfs_b->nr_throttled += overrun;
 
-        /*
-         * There are throttled entities so we must first use the new bandwidth
-         * to unthrottle them before making it generally available. This
-         * ensures that all existing debts will be paid before a new cfs_rq is
-         * allowed to run.
-         */
-        runtime = cfs_b->runtime;
         runtime_expires = cfs_b->runtime_expires;
-        cfs_b->runtime = 0;
 
         /*
-         * This check is repeated as we are holding onto the new bandwidth
-         * while we unthrottle. This can potentially race with an unthrottled
-         * group trying to acquire new bandwidth from the global pool.
+         * This check is repeated as we are holding onto the new bandwidth while
+         * we unthrottle. This can potentially race with an unthrottled group
+         * trying to acquire new bandwidth from the global pool. This can result
+         * in us over-using our runtime if it is all used during this loop, but
+         * only by limited amounts in that extreme case.
          */
-        while (throttled && runtime > 0) {
+        while (throttled && cfs_b->runtime > 0) {
+                runtime = cfs_b->runtime;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3518,10 +3518,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
                 raw_spin_lock(&cfs_b->lock);
 
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+                cfs_b->runtime -= min(runtime, cfs_b->runtime);
         }
 
-        /* return (any) remaining runtime */
-        cfs_b->runtime = runtime;
         /*
          * While we are ensured activity in the period following an
          * unthrottle, this also covers the case in which the new bandwidth is
@@ -3632,10 +3632,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 return;
         }
 
-        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
                 runtime = cfs_b->runtime;
-                cfs_b->runtime = 0;
-        }
+
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -3646,7 +3645,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
-                cfs_b->runtime = runtime;
+                cfs_b->runtime -= min(runtime, cfs_b->runtime);
         raw_spin_unlock(&cfs_b->lock);
 }
 
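[Editorial note] A side note on the new "cfs_b->runtime -= min(runtime, cfs_b->runtime);" lines in the hunks above: cfs_b->lock is dropped while distributing, so running groups can pull from the same pool concurrently, and the amount distribute_cfs_runtime() reports as used may exceed what is still left; the clamp presumably keeps the unsigned counter from wrapping below zero. A minimal userspace sketch of that arithmetic follows (toy values and a local min_u64() helper, not kernel code):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

static u64 min_u64(u64 a, u64 b)
{
        return a < b ? a : b;
}

int main(void)
{
        u64 pool = 3000;        /* runtime left after concurrent consumers */
        u64 used = 5000;        /* what distribute handed out while unlocked */

        u64 naive = pool - used;                  /* u64 underflow: wraps to ~1.8e19 */
        u64 clamped = pool - min_u64(used, pool); /* 0: the pool is simply empty */

        printf("naive:   %llu\n", (unsigned long long)naive);
        printf("clamped: %llu\n", (unsigned long long)clamped);
        return 0;
}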