author    Nicholas Piggin <npiggin@gmail.com>     2017-08-22 04:43:48 -0400
committer Thomas Gleixner <tglx@linutronix.de>    2017-08-24 05:40:18 -0400
commit    2fe59f507a65dbd734b990a11ebc7488f6f87a24 (patch)
tree      c4024c13c6e9adb566090a866e17fdc451def536
parent    14ccee78fc82f5512908f4424f541549a5705b89 (diff)
timers: Fix excessive granularity of new timers after a nohz idle
When a timer base is idle, it is forwarded when a new timer is added to
ensure that granularity does not become excessive. When not idle, the
timer tick is expected to increment the base.

However there are several problems:

- If an existing timer is modified, the base is forwarded only after the
  index is calculated.

- The base is not forwarded by add_timer_on.

- There is a window after a timer is restarted from a nohz idle, after it
  is marked not-idle and before the timer tick on this CPU, where a timer
  may be added but the ancient base does not get forwarded.

These result in excessive granularity (a 1 jiffy timeout can blow out to
100s of jiffies), which causes the RCU lockup detector to trigger, among
other things.

Fix this by keeping track of whether the timer base has been idle since it
was last run or forwarded, and if so then forward it before adding a new
timer.

There is still a case where mod_timer optimises the case of a pending timer
mod with the same expiry time, where the timer can see excessive granularity
relative to the new, shorter interval. A comment is added, but it's not
changed because it is an important fastpath for networking.

This has been tested and found to fix the RCU softlockup messages.

Testing was also done with tracing to measure requested versus achieved
wakeup latencies for all non-deferrable timers in an idle system (with no
lockup watchdogs running). Wakeup latency relative to absolute latency is
calculated (note this suffers from round-up skew at low absolute times) and
analysed:

             max     avg     std
  upstream   506.0   1.20    4.68
  patched      2.0   1.08    0.15

The bug was noticed due to the lockup detector Kconfig changes dropping it
out of people's .configs and resulting in larger base clk skew. When the
lockup detectors are enabled, no CPU can go idle for longer than 4 seconds,
which limits the granularity errors. Sub-optimal timer behaviour is
observable on a smaller scale in that case:

             max     avg     std
  upstream     9.0   1.05    0.19
  patched      2.0   1.04    0.11

Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: David Miller <davem@davemloft.net>
Cc: dzickus@redhat.com
Cc: sfr@canb.auug.org.au
Cc: mpe@ellerman.id.au
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: linuxarm@huawei.com
Cc: abdhalee@linux.vnet.ibm.com
Cc: John Stultz <john.stultz@linaro.org>
Cc: akpm@linux-foundation.org
Cc: paulmck@linux.vnet.ibm.com
Cc: torvalds@linux-foundation.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com
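For illustration only (not part of the patch): a minimal 4.13-era module
sketch of the scenario the changelog describes, arming a 1 jiffy timeout and
reporting how late the callback actually ran. All demo_* names are
hypothetical. Before this fix, a stale base clock on a CPU coming out of a
long nohz idle could push such a timer out by hundreds of jiffies; with
must_forward_clk the base is forwarded before the timer is queued, so it
should fire within a jiffy or two of the request.

	#include <linux/module.h>
	#include <linux/jiffies.h>
	#include <linux/timer.h>

	static struct timer_list demo_timer;	/* hypothetical example timer */
	static unsigned long demo_requested_at;

	/* Pre-4.15 timer callback signature: takes the 'data' cookie. */
	static void demo_timer_fn(unsigned long data)
	{
		/* Requested delay was 1 jiffy; report how many extra jiffies we slept. */
		pr_info("demo timer skew: %ld jiffies late\n",
			(long)(jiffies - demo_requested_at) - 1);
	}

	static int __init demo_init(void)
	{
		setup_timer(&demo_timer, demo_timer_fn, 0);

		/*
		 * Arm a 1 jiffy timeout. If this CPU's timer base sat idle in
		 * nohz with a stale ->clk, the unpatched kernel could place the
		 * timer in a far-out wheel bucket; with this patch the base is
		 * forwarded in __mod_timer()/add_timer_on() first.
		 */
		demo_requested_at = jiffies;
		mod_timer(&demo_timer, jiffies + 1);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		del_timer_sync(&demo_timer);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");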
-rw-r--r--   kernel/time/timer.c   50
1 file changed, 41 insertions(+), 9 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8f5d1bf18854..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
 	bool			migration_enabled;
 	bool			nohz_active;
 	bool			is_idle;
+	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
+	unsigned long jnow;
 
 	/*
-	 * We only forward the base when it's idle and we have a delta between
-	 * base clock and jiffies.
+	 * We only forward the base when we are idle or have just come out of
+	 * idle (must_forward_clk logic), and have a delta between base clock
+	 * and jiffies. In the common case, run_timers will take care of it.
 	 */
-	if (!base->is_idle || (long) (jnow - base->clk) < 2)
+	if (likely(!base->must_forward_clk))
+		return;
+
+	jnow = READ_ONCE(jiffies);
+	base->must_forward_clk = base->is_idle;
+	if ((long)(jnow - base->clk) < 2)
 		return;
 
 	/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
+		/*
+		 * The downside of this optimization is that it can result in
+		 * larger granularity than you would get from adding a new
+		 * timer with this expiry.
+		 */
 		if (timer->expires == expires)
 			return 1;
 
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * dequeue/enqueue dance.
 		 */
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 
 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	} else {
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 	}
 
 	ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
+			forward_timer_base(base);
 		}
 	}
 
-	/* Try to forward a stale timer base clock */
-	forward_timer_base(base);
-
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
+	forward_timer_base(base);
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
@@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if (!is_max_delta)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 		/*
-		 * If we expect to sleep more than a tick, mark the base idle:
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 * Also the tick is stopped so any added timer must forward
+		 * the base clk itself to keep granularity small. This idle
+		 * logic is only maintained for the BASE_STD base, deferrable
+		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC) {
+			base->must_forward_clk = true;
 			base->is_idle = true;
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+	/*
+	 * must_forward_clk must be cleared before running timers so that any
+	 * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+	 * BASE_STD timers.
+	 *
+	 * The deferrable base does not do idle tracking at all, so we do
+	 * not forward it. This can result in very large variations in
+	 * granularity for deferrable timers, but they can be deferred for
+	 * long periods due to idle.
+	 */
+	base->must_forward_clk = false;
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
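Purely illustrative, and not the author's actual tooling: a small userspace C
sketch of the kind of analysis quoted in the changelog, computing max/avg/std
from requested/achieved wakeup delays gathered via tracing. The input format
(one "requested achieved" pair in jiffies per line on stdin) and the choice of
the achieved-to-requested ratio as the statistic are assumptions inferred from
the text, not stated by the commit.

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double requested, achieved;
		double sum = 0.0, sumsq = 0.0, max = 0.0;
		unsigned long n = 0;

		/* One "requested achieved" pair per line, both in jiffies (assumed format). */
		while (scanf("%lf %lf", &requested, &achieved) == 2) {
			double ratio;

			if (requested <= 0.0)
				continue;	/* skip malformed samples */

			ratio = achieved / requested;
			sum += ratio;
			sumsq += ratio * ratio;
			if (ratio > max)
				max = ratio;
			n++;
		}

		if (!n) {
			fprintf(stderr, "no samples\n");
			return 1;
		}

		{
			double avg = sum / n;
			double var = sumsq / n - avg * avg;

			printf("max %.1f avg %.2f std %.2f\n",
			       max, avg, sqrt(var > 0.0 ? var : 0.0));
		}
		return 0;
	}

Build with "cc -O2 -o wakeup-stats wakeup-stats.c -lm" and feed it the traced
pairs; a patched kernel should report numbers close to the second row of each
table above.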