author     Frederic Weisbecker <fweisbec@gmail.com>    2016-04-13 09:56:51 -0400
committer  Ingo Molnar <mingo@kernel.org>              2016-04-23 08:20:42 -0400
commit     1f41906a6fda1114debd3898668bd7ab6470ee41
tree       3960f1f67cd11aeadc368fee9dce6f393ad56ed3 /kernel/sched
parent     cee1afce3053e7aa0793fbd5f2e845fa2cef9e33
sched/fair: Correctly handle nohz ticks CPU load accounting
Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
mode and try to minimize the number of ticks. The nohz subsystem uses
confusing terminology with the internal state "ts->tick_stopped", which
is also exposed through its public interface via tick_nohz_tick_stopped().
This is a misnomer: the tick is reduced on a best-effort basis rather
than stopped. In the best case the tick can indeed be stopped entirely,
but there is no guarantee of that. If a timer needs to fire one second
later, a tick will fire while the CPU is in nohz mode, and this is a
very common scenario.

This confusion is a problem for CPU load updates:
cpu_load_update_active() doesn't handle nohz ticks correctly because it
assumes that ticks are completely stopped in nohz mode and that it can't
be called in dynticks mode. When a nohz tick does happen, the whole
previous tickless load is ignored and the function just records the load
for the current tick, discarding the potentially long idle period that
preceded it.

To solve this we could account the current load for the previous nohz
time, but there is a risk of accounting the load of a freshly enqueued
task for the whole nohz period. So instead, let's record the dynticks
load on nohz frame entry so we know what to record in case of nohz
ticks, then use this record to account the tickless load on nohz ticks
and on nohz frame end.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
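To make the scheme concrete, here is a minimal user-space model of the
record-then-account idea (illustrative only: the names, the missing decay
step and the jiffy values are made up, this is not the kernel code):

#include <stdio.h>

/* Toy model of one runqueue's load-tracking state. */
static unsigned long cpu_load0;          /* models rq->cpu_load[0]           */
static unsigned long last_update_tick;   /* models rq->last_load_update_tick */

/* Nohz frame entry: record the load that stays valid while ticks are off. */
static void nohz_start(unsigned long load, unsigned long now)
{
	cpu_load0 = load;
	last_update_tick = now;
}

/*
 * Nohz tick or nohz frame exit: credit the recorded tickless load for
 * every missed jiffy instead of discarding the tickless period.
 */
static void nohz_update(unsigned long load, unsigned long now)
{
	unsigned long pending = now - last_update_tick;

	printf("missed updates: %lu, tickless load: %lu, new sample: %lu\n",
	       pending, cpu_load0, load);
	cpu_load0 = load;
	last_update_tick = now;
}

int main(void)
{
	nohz_start(512, 1000);	/* tick stops with load 512 at jiffy 1000 */
	nohz_update(512, 1250);	/* a nohz tick fires 250 jiffies later    */
	return 0;
}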
Diffstat (limited to 'kernel/sched')
 kernel/sched/fair.c | 97 +++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 34 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecd81c4ebb56..b70367a3e1ef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4563,7 +4563,6 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
  *
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -4592,12 +4591,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * load[i]_n = (1 - 1/2^i)^n * load[i]_0
  *
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
  */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+			    unsigned long pending_updates)
 {
-	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+	unsigned long tickless_load = this_rq->cpu_load[0];
 	int i, scale;
 
 	this_rq->nr_load_updates++;
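As a worked illustration of the tickless_load handling above, here is a
simplified stand-alone sketch (an assumption-laden model, not the kernel
code: decay_missed() idealizes the kernel's table-driven
decay_load_missed(), and load_update() is a hypothetical name):

#define CPU_LOAD_IDX_MAX 5

/* Idealized decay_load_missed(): load * (1 - 1/2^idx)^missed. */
static unsigned long decay_missed(unsigned long load,
				  unsigned long missed, int idx)
{
	while (missed--)
		load -= load >> idx;
	return load;
}

static void load_update(unsigned long cpu_load[], unsigned long this_load,
			unsigned long pending_updates,
			unsigned long tickless_load)
{
	int i, scale;

	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* Decay the old value over the missed updates... */
		old_load = decay_missed(cpu_load[i], pending_updates - 1, i);
		if (tickless_load) {
			/*
			 * ...but those updates were not silent, they carried
			 * tickless_load: subtract its decayed contribution
			 * and add it back undecayed.
			 */
			old_load -= decay_missed(tickless_load,
						 pending_updates - 1, i);
			old_load += tickless_load;
		}
		new_load = this_load;
		if (new_load > old_load)	/* round up when ramping up */
			new_load += scale - 1;
		/* cpu_load[i] = (old * (2^i - 1) + new) / 2^i */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}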
@@ -4642,10 +4641,23 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-				   unsigned long curr_jiffies,
-				   unsigned long load,
-				   int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+				 unsigned long curr_jiffies,
+				 unsigned long load)
 {
 	unsigned long pending_updates;
 
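The {0,2} versus {1,1} remark in the comment above can be pictured on a
made-up timeline (numbers invented for illustration):

/*
 * Tick period = 1 jiffy. CPU0 owns the jiffies update, CPU1 reads
 * jiffies once per tick:
 *
 *   CPU0 updates:  j=1 lands at t=1.05   j=2 lands at t=1.95
 *   CPU1 reads:    t=0 -> j=0   t=1 -> j=0   t=2 -> j=2
 *
 * CPU1's observed per-tick deltas are {0, 2} instead of {1, 1}.
 */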
@@ -4657,24 +4669,11 @@ static void __cpu_load_update_nohz(struct rq *this_rq,
 		 * In the NOHZ_FULL case, we were non-idle, we should consider
 		 * its weighted load.
 		 */
-		__cpu_load_update(this_rq, load, pending_updates, active);
+		cpu_load_update(this_rq, load, pending_updates);
 	}
 }
 
 /*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-/*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
@@ -4686,26 +4685,56 @@ static void cpu_load_update_idle(struct rq *this_rq)
 	if (weighted_cpuload(cpu_of(this_rq)))
 		return;
 
-	__cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
 }
 
 /*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely into synchronized cpu_load[*] updates.
  */
-void cpu_load_update_nohz(int active)
+void cpu_load_update_nohz_start(void)
 {
 	struct rq *this_rq = this_rq();
+
+	/*
+	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * concurrently we'll exit nohz. And cpu_load write can race with
+	 * cpu_load_update_idle() but both updater would be writing the same.
+	 */
+	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load in the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+	struct rq *this_rq = this_rq();
+	unsigned long load;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
+	load = weighted_cpuload(cpu_of(this_rq));
 	raw_spin_lock(&this_rq->lock);
-	__cpu_load_update_nohz(this_rq, curr_jiffies, load, active);
+	cpu_load_update_nohz(this_rq, curr_jiffies, load);
 	raw_spin_unlock(&this_rq->lock);
 }
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+					unsigned long curr_jiffies,
+					unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+	/* See the mess around cpu_load_update_nohz(). */
+	this_rq->last_load_update_tick = READ_ONCE(jiffies);
+	cpu_load_update(this_rq, load, 1);
+}
 
 /*
  * Called from scheduler_tick()
@@ -4713,11 +4742,11 @@ void cpu_load_update_nohz(int active)
 void cpu_load_update_active(struct rq *this_rq)
 {
 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
-	/*
-	 * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__cpu_load_update(this_rq, load, 1, 1);
+
+	if (tick_nohz_tick_stopped())
+		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+	else
+		cpu_load_update_periodic(this_rq, load);
 }
 
 /*
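Taken together, the control flow this patch sets up looks roughly as
follows (a schematic sketch; the calls into cpu_load_update_nohz_start()
and cpu_load_update_nohz_stop() from the tick-sched code are presumably
wired up in a companion patch and are not shown in this diff):

/*
 * Schematic lifecycle, not actual kernel code:
 *
 *   tick stops on this CPU
 *     -> cpu_load_update_nohz_start()      record cpu_load[0]
 *   a nohz tick fires anyway (e.g. a timer due one second later)
 *     -> cpu_load_update_active()
 *          tick_nohz_tick_stopped() is true
 *            -> cpu_load_update_nohz()     account the tickless load
 *   tick restarts
 *     -> cpu_load_update_nohz_stop()       account the remaining frame
 *   periodic mode
 *     -> cpu_load_update_active()
 *          -> cpu_load_update_periodic()   plain one-tick update
 */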