author    Linus Torvalds <torvalds@linux-foundation.org>  2017-08-26 12:02:18 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-08-26 12:02:18 -0400
commit    0adb8f3d312966bd3e712248b48098ce086d03bd (patch)
tree      948eed0b139176ca4c79cefe51d38e8adc09723e
parent    53ede64de3da17f76628b493f5e19a01804939d3 (diff)
parent    2fe59f507a65dbd734b990a11ebc7488f6f87a24 (diff)
Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer fix from Ingo Molnar:
"Fix a timer granularity handling race+bug, which would manifest itself
by spuriously increasing timeouts of some timers (from 1 jiffy to ~500
jiffies in the worst case measured) in certain nohz states"
* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
timers: Fix excessive granularity of new timers after a nohz idle
-rw-r--r--  kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 41 insertions(+), 9 deletions(-)
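For context on the failure mode described above: when a timer is queued, calc_wheel_index() picks a wheel level from the delta between the expiry and base->clk, and each successive level trades range for coarser granularity. If base->clk goes stale across a nohz idle, that delta is inflated and a short timeout lands in a coarse level. The following userspace sketch reproduces the blow-up; the constants echo the kernel's LVL_BITS/LVL_CLK_SHIFT, but wheel_level() and show() are simplified stand-ins, not kernel code:

#include <stdio.h>

/* Simplified wheel-level math: level n holds 64 buckets of 2^(3n) jiffies
 * (cf. LVL_BITS = 6, LVL_CLK_SHIFT = 3 in kernel/time/timer.c). This is an
 * illustrative stand-in for calc_wheel_index(); the real code also clamps
 * at the top level.
 */
#define LVL_BITS	6
#define LVL_CLK_SHIFT	3

static unsigned int wheel_level(unsigned long delta)
{
	unsigned int lvl = 0;

	while (delta >= (1UL << (LVL_BITS + lvl * LVL_CLK_SHIFT)))
		lvl++;
	return lvl;
}

static void show(const char *what, unsigned long clk, unsigned long expires)
{
	unsigned long delta = expires - clk;
	unsigned int lvl = wheel_level(delta);

	printf("%s: delta=%5lu -> level %u, granularity %lu jiffies\n",
	       what, delta, lvl, 1UL << (lvl * LVL_CLK_SHIFT));
}

int main(void)
{
	unsigned long jiffies = 100000;
	unsigned long expires = jiffies + 1;	/* caller asks for 1 jiffy */

	show("fresh base->clk", jiffies, expires);
	show("stale base->clk", jiffies - 32000, expires); /* after nohz sleep */
	return 0;
}

With a base clock ~32000 jiffies stale, the 1-jiffy timeout is filed with 512-jiffy granularity, matching the "1 jiffy to ~500 jiffies" figure quoted above.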
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8f5d1bf18854..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
 	bool			migration_enabled;
 	bool			nohz_active;
 	bool			is_idle;
+	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head	vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-	unsigned long jnow = READ_ONCE(jiffies);
+	unsigned long jnow;
 
 	/*
-	 * We only forward the base when it's idle and we have a delta between
-	 * base clock and jiffies.
+	 * We only forward the base when we are idle or have just come out of
+	 * idle (must_forward_clk logic), and have a delta between base clock
+	 * and jiffies. In the common case, run_timers will take care of it.
 	 */
-	if (!base->is_idle || (long) (jnow - base->clk) < 2)
+	if (likely(!base->must_forward_clk))
+		return;
+
+	jnow = READ_ONCE(jiffies);
+	base->must_forward_clk = base->is_idle;
+	if ((long)(jnow - base->clk) < 2)
 		return;
 
 	/*
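The subtle line in the rewritten function is base->must_forward_clk = base->is_idle: each caller consumes the flag, and it re-arms only while the base is still idle, since the first timer softirq after idle exit catches the clock up anyway. A minimal userspace model of that consume/re-arm protocol, with the catch-up reduced to clk = jiffies (the real code also honors next_expiry); all types here are illustrative stand-ins:

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies;

struct timer_base {		/* stand-in, not the kernel struct */
	unsigned long clk;
	bool is_idle;
	bool must_forward_clk;
};

static void forward_timer_base(struct timer_base *base)
{
	if (!base->must_forward_clk)	/* common case: one cheap test */
		return;
	/* Re-arm only while still idle; after idle exit, one forward is
	 * enough because run_timers() takes over from here. */
	base->must_forward_clk = base->is_idle;
	if ((long)(jiffies - base->clk) < 2)
		return;
	base->clk = jiffies;	/* simplified: real code honors next_expiry */
}

int main(void)
{
	struct timer_base base = { .is_idle = true, .must_forward_clk = true };

	jiffies = 1000;
	forward_timer_base(&base);	/* idle: forwards, stays armed */
	printf("idle: clk=%lu armed=%d\n", base.clk, base.must_forward_clk);

	base.is_idle = false;		/* tick restarted, softirq not yet run */
	jiffies = 1002;
	forward_timer_base(&base);	/* forwards once more, then disarms */
	printf("exit: clk=%lu armed=%d\n", base.clk, base.must_forward_clk);
	return 0;
}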
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	 * same array bucket then just return:
 	 */
 	if (timer_pending(timer)) {
+		/*
+		 * The downside of this optimization is that it can result in
+		 * larger granularity than you would get from adding a new
+		 * timer with this expiry.
+		 */
 		if (timer->expires == expires)
 			return 1;
 
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * dequeue/enqueue dance.
 		 */
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 
 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		}
 	} else {
 		base = lock_timer_base(timer, &flags);
+		forward_timer_base(base);
 	}
 
 	ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 			raw_spin_lock(&base->lock);
 			WRITE_ONCE(timer->flags,
 				   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
+			forward_timer_base(base);
 		}
 	}
 
-	/* Try to forward a stale timer base clock */
-	forward_timer_base(base);
-
 	timer->expires = expires;
 	/*
 	 * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 		WRITE_ONCE(timer->flags,
 			   (timer->flags & ~TIMER_BASEMASK) | cpu);
 	}
+	forward_timer_base(base);
 
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
@@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		if (!is_max_delta)
 			expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
 		/*
-		 * If we expect to sleep more than a tick, mark the base idle:
+		 * If we expect to sleep more than a tick, mark the base idle.
+		 * Also the tick is stopped so any added timer must forward
+		 * the base clk itself to keep granularity small. This idle
+		 * logic is only maintained for the BASE_STD base, deferrable
+		 * timers may still see large granularity skew (by design).
 		 */
-		if ((expires - basem) > TICK_NSEC)
+		if ((expires - basem) > TICK_NSEC) {
+			base->must_forward_clk = true;
 			base->is_idle = true;
+		}
 	}
 	raw_spin_unlock(&base->lock);
 
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+	/*
+	 * must_forward_clk must be cleared before running timers so that any
+	 * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+	 * BASE_STD timers.
+	 *
+	 * The deferrable base does not do idle tracking at all, so we do
+	 * not forward it. This can result in very large variations in
+	 * granularity for deferrable timers, but they can be deferred for
+	 * long periods due to idle.
+	 */
+	base->must_forward_clk = false;
+
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
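Taken together, the three patched sites form a small state machine: get_next_timer_interrupt() arms must_forward_clk when stopping the tick, __mod_timer()/add_timer_on() consume it under the base lock, and run_timer_softirq() clears it before expiring anything, so a callback that re-arms itself via mod_timer() cannot forward base->clk while __run_timers() is mid-walk. A compact end-to-end simulation of that ordering; every type and helper is an illustrative stand-in, not kernel code:

#include <stdbool.h>
#include <stdio.h>

static unsigned long jiffies;

struct timer_base {
	unsigned long clk;
	bool is_idle;
	bool must_forward_clk;
};

static void forward_timer_base(struct timer_base *base)
{
	if (!base->must_forward_clk)
		return;
	base->must_forward_clk = base->is_idle;
	if ((long)(jiffies - base->clk) < 2)
		return;
	base->clk = jiffies;			/* simplified catch-up */
}

/* Site 1: stopping the tick for a long sleep arms the flag. */
static void stop_tick(struct timer_base *base)
{
	base->is_idle = true;
	base->must_forward_clk = true;
}

/* Site 2: (re)arming a timer forwards the base under the lock, so the
 * wheel delta is computed against a fresh clk. */
static void mod_timer_model(struct timer_base *base, unsigned long timeout)
{
	unsigned long expires = jiffies + timeout;

	forward_timer_base(base);
	printf("queue: jiffies=%lu clk=%lu wheel delta=%lu\n",
	       jiffies, base->clk, expires - base->clk);
}

/* Site 3: the softirq clears the flag before expiring timers, so
 * callbacks that call mod_timer() don't forward the base mid-run.
 * Idle exit is folded in here for brevity. */
static void run_timer_softirq_model(struct timer_base *base)
{
	base->must_forward_clk = false;
	base->is_idle = false;
	base->clk = jiffies;		/* __run_timers() advances clk itself */
	mod_timer_model(base, 1);	/* e.g. a self-rearming callback */
}

int main(void)
{
	struct timer_base base = { 0 };

	stop_tick(&base);
	jiffies = 1000;			/* long nohz sleep */
	mod_timer_model(&base, 1);	/* delta stays 1, not 1001 */

	jiffies += 1;
	run_timer_softirq_model(&base);
	return 0;
}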