author		Linus Torvalds <torvalds@linux-foundation.org>	2017-08-26 12:02:18 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-08-26 12:02:18 -0400
commit		0adb8f3d312966bd3e712248b48098ce086d03bd (patch)
tree		948eed0b139176ca4c79cefe51d38e8adc09723e
parent		53ede64de3da17f76628b493f5e19a01804939d3 (diff)
parent		2fe59f507a65dbd734b990a11ebc7488f6f87a24 (diff)
Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer fix from Ingo Molnar:
 "Fix a timer granularity handling race+bug, which would manifest itself
  by spuriously increasing timeouts of some timers (from 1 jiffy to ~500
  jiffies in the worst case measured) in certain nohz states"

* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers: Fix excessive granularity of new timers after a nohz idle
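
The "excessive granularity" named in the patch title falls out of the hierarchical timer wheel: each level trades resolution for range, so a delta measured against a stale base->clk can drop a short timeout into a coarse level. The sketch below is a minimal stand-alone model of the level selection in kernel/time/timer.c (illustrative only, not kernel code; it assumes the in-tree constants LVL_CLK_SHIFT = 3 and 6-bit levels, and elides calc_index()'s round-up):

#include <stdio.h>

/* Constants mirroring kernel/time/timer.c (as of this merge). */
#define LVL_CLK_SHIFT	3
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_GRAN(n)	(1UL << ((n) * LVL_CLK_SHIFT))
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/* Pick the wheel level for an expiry delta, as calc_wheel_index() does. */
static unsigned int wheel_level(unsigned long delta)
{
	unsigned int lvl = 0;

	while (lvl < 8 && delta >= LVL_START(lvl + 1))
		lvl++;
	return lvl;
}

int main(void)
{
	/*
	 * A 1-jiffy timeout measured against an up-to-date clock stays in
	 * level 0 (1-jiffy resolution). Measured against a clock that lags
	 * jiffies by ~500 - the pre-patch nohz case - the same timeout looks
	 * like a ~501-jiffy delta and is queued with 8-jiffy granularity.
	 */
	unsigned long deltas[] = { 1, 501 };

	for (int i = 0; i < 2; i++) {
		unsigned int lvl = wheel_level(deltas[i]);

		printf("delta %3lu -> level %u, granularity %lu jiffies\n",
		       deltas[i], lvl, LVL_GRAN(lvl));
	}
	return 0;
}

The fix below therefore forwards base->clk (via forward_timer_base()) right after lock_timer_base(), before calc_wheel_index() picks a bucket, instead of only after the bucket has been chosen.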
-rw-r--r--	kernel/time/timer.c	50
1 file changed, 41 insertions(+), 9 deletions(-)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8f5d1bf18854..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
 	bool			migration_enabled;
 	bool			nohz_active;
 	bool			is_idle;
+	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
+	unsigned long jnow;
 
 	/*
-	 * We only forward the base when it's idle and we have a delta between
-	 * base clock and jiffies.
+	 * We only forward the base when we are idle or have just come out of
+	 * idle (must_forward_clk logic), and have a delta between base clock
+	 * and jiffies. In the common case, run_timers will take care of it.
 	 */
-	if (!base->is_idle || (long) (jnow - base->clk) < 2)
+	if (likely(!base->must_forward_clk))
+		return;
+
+	jnow = READ_ONCE(jiffies);
+	base->must_forward_clk = base->is_idle;
+	if ((long)(jnow - base->clk) < 2)
 		return;
 
 	/*
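
Two details of the rewritten function are easy to miss: the likely(!base->must_forward_clk) test keeps the common, non-idle case down to a single flag read, and base->must_forward_clk = base->is_idle re-arms the flag only while the base is still idle. The delta check that follows uses the standard wrap-safe jiffies idiom; a small stand-alone demonstration (plain userspace C, illustrative only):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long clk  = ULONG_MAX - 1;	/* base->clk just before wrap */
	unsigned long jnow = 3;			/* jiffies, five ticks later */

	/*
	 * Subtracting in unsigned arithmetic and casting to signed long
	 * gives the true distance even across the counter wrap, so the
	 * "< 2" cutoff behaves correctly near ULONG_MAX.
	 */
	if ((long)(jnow - clk) < 2)
		printf("delta < 2: nothing to forward\n");
	else
		printf("delta = %ld: forward the base clock\n",
		       (long)(jnow - clk));
	return 0;
}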
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
+		/*
+		 * The downside of this optimization is that it can result in
+		 * larger granularity than you would get from adding a new
+		 * timer with this expiry.
+		 */
 		if (timer->expires == expires)
 			return 1;
 
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * dequeue/enqueue dance.
 		 */
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 
 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	} else {
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 	}
 
 	ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
+			forward_timer_base(base);
 		}
 	}
 
-	/* Try to forward a stale timer base clock */
-	forward_timer_base(base);
-
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
+	forward_timer_base(base);
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
@@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if (!is_max_delta)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 		/*
-		 * If we expect to sleep more than a tick, mark the base idle:
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 * Also the tick is stopped so any added timer must forward
+		 * the base clk itself to keep granularity small. This idle
+		 * logic is only maintained for the BASE_STD base; deferrable
+		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC) {
+			base->must_forward_clk = true;
 			base->is_idle = true;
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
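
For orientation, the idle test above is plain tick arithmetic: since expires is derived from nextevt, (expires - basem) > TICK_NSEC is just "the next timer is more than one jiffy away". A toy stand-alone computation (assuming HZ=1000, so one tick is 10^6 ns; not kernel code):

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC 1000000ULL	/* assumes HZ=1000 */

int main(void)
{
	uint64_t basem = 0;		/* current time in ns */
	unsigned long basej = 1000;	/* current time in jiffies */
	unsigned long nextevt = 1005;	/* next timer expiry in jiffies */
	uint64_t expires = basem + (uint64_t)(nextevt - basej) * TICK_NSEC;

	/*
	 * More than one tick of sleep ahead: the base goes idle and, with
	 * this patch, also sets must_forward_clk for later mod_timer() calls.
	 */
	printf("mark idle: %s\n",
	       (expires - basem) > TICK_NSEC ? "yes" : "no");
	return 0;
}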
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+	/*
+	 * must_forward_clk must be cleared before running timers so that any
+	 * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+	 * BASE_STD timers.
+	 *
+	 * The deferrable base does not do idle tracking at all, so we do
+	 * not forward it. This can result in very large variations in
+	 * granularity for deferrable timers, but they can be deferred for
+	 * long periods due to idle.
+	 */
+	base->must_forward_clk = false;
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
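
Read end to end, the patch gives must_forward_clk a three-step lifecycle. The comment-only sketch below is an editorial summary of the hunks above, using the patch's own function names:

/*
 * must_forward_clk lifecycle (summary of the hunks above):
 *
 * 1. get_next_timer_interrupt()  - the CPU is about to stop the tick:
 *        base->must_forward_clk = true;
 *        base->is_idle = true;
 *
 * 2. __mod_timer() / add_timer_on()  - a timer is armed during, or right
 *    after, the idle period: under the base lock, forward_timer_base()
 *    catches base->clk up to jiffies before calc_wheel_index() picks a
 *    bucket, and re-arms the flag only while base->is_idle is still set.
 *
 * 3. run_timer_softirq()  - the tick is running again:
 *        base->must_forward_clk = false;   (__run_timers() advances clk)
 */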