path: root/kernel
author		Phil Auld <pauld@redhat.com>	2018-10-08 10:36:40 -0400
committer	Ingo Molnar <mingo@kernel.org>	2018-10-11 07:10:18 -0400
commit		baa9be4ffb55876923dc9716abc0a448e510ba30 (patch)
tree		34b27ae24bb7b748c667d540a7a2513b2ab16aab /kernel
parent		e054637597ba36d3729ba6a3a3dd7aad8e2a3003 (diff)
sched/fair: Fix throttle_list starvation with low CFS quota
With a very low cpu.cfs_quota_us setting, such as the minimum of 1000,
distribute_cfs_runtime may not empty the throttled_list before it runs
out of runtime to distribute. In that case, due to the change from
commit c06f04c70489 to put throttled entries at the head of the list,
later entries on the list will starve. Essentially, the same X
processes get pulled off the list, given CPU time and then, when
expired, get put back on the head of the list where
distribute_cfs_runtime will give runtime to the same set of processes,
leaving the rest to starve.

Fix the issue by setting a bit in struct cfs_bandwidth when
distribute_cfs_runtime is running, so that the code in throttle_cfs_rq
can decide to put the throttled entry on the tail or the head of the
list. The bit is set/cleared by the callers of distribute_cfs_runtime
while they hold cfs_bandwidth->lock.

This is easy to reproduce with a handful of CPU consumers. I use
'crash' on the live system. In some cases you can simply look at the
throttled list and see that the later entries are not changing:

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -976050
    2     ffff90b56cb2cc00  -484925
    3     ffff90b56cb2bc00  -658814
    4     ffff90b56cb2ba00  -275365
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

  crash> list cfs_rq.throttled_list -H 0xffff90b54f6ade40 -s cfs_rq.runtime_remaining | paste - - | awk '{print $1" "$4}' | pr -t -n3
    1     ffff90b56cb2d200  -994147
    2     ffff90b56cb2cc00  -306051
    3     ffff90b56cb2bc00  -961321
    4     ffff90b56cb2ba00  -24490
    5     ffff90b166a45600  -135138
    6     ffff90b56cb2da00  -282505
    7     ffff90b56cb2e000  -148065
    8     ffff90b56cb2fa00  -872591
    9     ffff90b56cb2c000  -84687
   10     ffff90b56cb2f000  -87237
   11     ffff90b166a40a00  -164582

Sometimes it is easier to see by finding a process getting starved and
looking at the sched_info:

  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },

  crash> task ffff8eb765994500 sched_info
  PID: 7800   TASK: ffff8eb765994500  CPU: 16  COMMAND: "cputest"
    sched_info = {
      pcount = 8,
      run_delay = 697094208,
      last_arrival = 240260125039,
      last_queued = 240260327513
    },

Signed-off-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop")
Link: http://lkml.kernel.org/r/20181008143639.GA4019@pauld.bos.csb
Signed-off-by: Ingo Molnar <mingo@kernel.org>
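For reference, a minimal userspace reproducer in the spirit of the
"handful of CPU consumers" described above might look like the sketch
below. It is not part of the patch; the default consumer count, the
cgroup-v1 mount point /sys/fs/cgroup/cpu and the group name "test" are
illustrative assumptions (set the quota with something like
"echo 1000 > /sys/fs/cgroup/cpu/test/cpu.cfs_quota_us" and move the
children into the group via its tasks file):

/* Hypothetical reproducer sketch, not from the patch: fork N CPU hogs. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int i, nproc = (argc > 1) ? atoi(argv[1]) : 10;

	for (i = 0; i < nproc; i++) {
		pid_t pid = fork();

		if (pid < 0) {
			perror("fork");
			return 1;
		}
		if (pid == 0) {
			/* child: spin forever so the group stays throttled */
			for (volatile unsigned long x = 0; ; x++)
				;
		}
	}
	pause();	/* parent idles; the children do the consuming */
	return 0;
}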
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched/fair.c	22
-rw-r--r--	kernel/sched/sched.h	2
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7fc4a371bdd2..f88e00705b55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,9 +4476,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 	/*
 	 * Add to the _head_ of the list, so that an already-started
-	 * distribute_cfs_runtime will not see us
+	 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+	 * not running, add to the tail so that later runqueues don't get starved.
 	 */
-	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	if (cfs_b->distribute_running)
+		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	else
+		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
 	/*
 	 * If we're the first throttled task, make sure the bandwidth
@@ -4622,14 +4626,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	 * in us over-using our runtime if it is all used during this loop, but
 	 * only by limited amounts in that extreme case.
 	 */
-	while (throttled && cfs_b->runtime > 0) {
+	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
+		cfs_b->distribute_running = 1;
 		raw_spin_unlock(&cfs_b->lock);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
 		raw_spin_lock(&cfs_b->lock);
 
+		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4740,6 +4746,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
 	/* confirm we're still not at a refresh boundary */
 	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->distribute_running) {
+		raw_spin_unlock(&cfs_b->lock);
+		return;
+	}
+
 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
 		raw_spin_unlock(&cfs_b->lock);
 		return;
@@ -4749,6 +4760,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		runtime = cfs_b->runtime;
 
 	expires = cfs_b->runtime_expires;
+	if (runtime)
+		cfs_b->distribute_running = 1;
+
 	raw_spin_unlock(&cfs_b->lock);
 
 	if (!runtime)
@@ -4759,6 +4773,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 	raw_spin_lock(&cfs_b->lock);
 	if (expires == cfs_b->runtime_expires)
 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
+	cfs_b->distribute_running = 0;
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4867,6 +4882,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+	cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 455fa330de04..9683f458aec7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -346,6 +346,8 @@ struct cfs_bandwidth {
 	int			nr_periods;
 	int			nr_throttled;
 	u64			throttled_time;
+
+	bool			distribute_running;
 #endif
 };
 
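The locking pattern the patch introduces is simple to see in isolation:
a flag that is only read and written under cfs_b->lock marks the
section where the lock must be dropped to hand out runtime, and the
other paths (the period timer loop, the slack timer, and the
head-vs-tail choice in throttle_cfs_rq) key off it. A self-contained
userspace analogy, with a pthread mutex standing in for the raw
spinlock and simplified names that are not the kernel code:

/* Userspace analogy of the distribute_running handshake (sketch only). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool distribute_running = false;	/* toggled only under 'lock' */

/* stand-in for distribute_cfs_runtime(): must run with 'lock' dropped */
static void distribute(void)
{
	puts("distributing runtime with the lock dropped");
}

static void timer_body(void)
{
	pthread_mutex_lock(&lock);
	if (!distribute_running) {
		distribute_running = true;	/* claim the section */
		pthread_mutex_unlock(&lock);	/* safe to drop it now */
		distribute();
		pthread_mutex_lock(&lock);
		distribute_running = false;	/* release the claim */
	}
	/* else: another timer is mid-distribution; back off */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	timer_body();
	return 0;
}

Because the flag is only touched under the lock, a second timer firing
while the first has the lock dropped sees distribute_running set and
backs off, and a newly throttled cfs_rq can choose the tail of the list
instead of the head, which is exactly what prevents the starvation
described in the changelog.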