aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/time
diff options
context:
space:
mode:
author Thomas Gleixner <tglx@linutronix.de> 2015-04-14 17:08:58 -0400
committer Thomas Gleixner <tglx@linutronix.de> 2015-04-22 11:06:50 -0400
commit c1ad348b452aacd784fb97403d03d71723c72ee1 (patch)
tree 8f57456095d7125463a9647701acfe24b9d96ffc /kernel/time
parent 157d29e101c7d032e886df067aeea1b21a366cc5 (diff)
tick: Nohz: Rework next timer evaluation
The evaluation of the next timer in the nohz code is based on jiffies while all the tick internals are nanoseconds based. We also have to convert hrtimer nanoseconds to jiffies in the !highres case. That's just wrong and introduces interesting corner cases. Turn it around and convert the next timer wheel timer expiry and the rcu event to clock monotonic and base all calculations on nanoseconds. That clearly identifies the case where no timer is pending with an absolute expiry value of KTIME_MAX. Makes the code more readable and gets rid of the jiffies magic in the nohz code. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com> Cc: Viresh Kumar <viresh.kumar@linaro.org> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Josh Triplett <josh@joshtriplett.org> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: John Stultz <john.stultz@linaro.org> Cc: Marcelo Tosatti <mtosatti@redhat.com> Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel/time')
-rw-r--r-- kernel/time/hrtimer.c 14
-rw-r--r-- kernel/time/tick-internal.h 2
-rw-r--r-- kernel/time/tick-sched.c 109
-rw-r--r-- kernel/time/tick-sched.h 2
-rw-r--r-- kernel/time/timer.c 71
-rw-r--r-- kernel/time/timer_list.c 4
6 files changed, 93 insertions, 109 deletions
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fc6b6d25f93d..179b991cfdcb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1080/** 1080/**
1081 * hrtimer_get_next_event - get the time until next expiry event 1081 * hrtimer_get_next_event - get the time until next expiry event
1082 * 1082 *
1083 * Returns the delta to the next expiry event or KTIME_MAX if no timer 1083 * Returns the next expiry time or KTIME_MAX if no timer is pending.
1084 * is pending.
1085 */ 1084 */
1086ktime_t hrtimer_get_next_event(void) 1085u64 hrtimer_get_next_event(void)
1087{ 1086{
1088 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); 1087 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1089 ktime_t mindelta = { .tv64 = KTIME_MAX }; 1088 u64 expires = KTIME_MAX;
1090 unsigned long flags; 1089 unsigned long flags;
1091 1090
1092 raw_spin_lock_irqsave(&cpu_base->lock, flags); 1091 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1093 1092
1094 if (!__hrtimer_hres_active(cpu_base)) 1093 if (!__hrtimer_hres_active(cpu_base))
1095 mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), 1094 expires = __hrtimer_get_next_event(cpu_base).tv64;
1096 ktime_get());
1097 1095
1098 raw_spin_unlock_irqrestore(&cpu_base->lock, flags); 1096 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1099 1097
1100 if (mindelta.tv64 < 0) 1098 return expires;
1101 mindelta.tv64 = 0;
1102 return mindelta;
1103} 1099}
1104#endif 1100#endif
1105 1101
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd8054c5..65273f0a11ed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -137,3 +137,5 @@ extern void tick_nohz_init(void);
137# else 137# else
138static inline void tick_nohz_init(void) { } 138static inline void tick_nohz_init(void) { }
139#endif 139#endif
140
141extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4c5f4a9dcc0a..753c211f6195 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
582static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, 582static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
583 ktime_t now, int cpu) 583 ktime_t now, int cpu)
584{ 584{
585 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
586 ktime_t last_update, expires, ret = { .tv64 = 0 };
587 unsigned long rcu_delta_jiffies;
588 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); 585 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
589 u64 time_delta; 586 u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
590 587 unsigned long seq, basejiff;
591 time_delta = timekeeping_max_deferment(); 588 ktime_t tick;
592 589
593 /* Read jiffies and the time when jiffies were updated last */ 590 /* Read jiffies and the time when jiffies were updated last */
594 do { 591 do {
595 seq = read_seqbegin(&jiffies_lock); 592 seq = read_seqbegin(&jiffies_lock);
596 last_update = last_jiffies_update; 593 basemono = last_jiffies_update.tv64;
597 last_jiffies = jiffies; 594 basejiff = jiffies;
598 } while (read_seqretry(&jiffies_lock, seq)); 595 } while (read_seqretry(&jiffies_lock, seq));
596 ts->last_jiffies = basejiff;
599 597
600 if (rcu_needs_cpu(&rcu_delta_jiffies) || 598 if (rcu_needs_cpu(basemono, &next_rcu) ||
601 arch_needs_cpu() || irq_work_needs_cpu()) { 599 arch_needs_cpu() || irq_work_needs_cpu()) {
602 next_jiffies = last_jiffies + 1; 600 next_tick = basemono + TICK_NSEC;
603 delta_jiffies = 1;
604 } else { 601 } else {
605 /* Get the next timer wheel timer */ 602 /*
606 next_jiffies = get_next_timer_interrupt(last_jiffies); 603 * Get the next pending timer. If high resolution
607 delta_jiffies = next_jiffies - last_jiffies; 604 * timers are enabled this only takes the timer wheel
608 if (rcu_delta_jiffies < delta_jiffies) { 605 * timers into account. If high resolution timers are
609 next_jiffies = last_jiffies + rcu_delta_jiffies; 606 * disabled this also looks at the next expiring
610 delta_jiffies = rcu_delta_jiffies; 607 * hrtimer.
611 } 608 */
609 next_tmr = get_next_timer_interrupt(basejiff, basemono);
610 ts->next_timer = next_tmr;
611 /* Take the next rcu event into account */
612 next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
612 } 613 }
613 614
614 if ((long)delta_jiffies <= 1) { 615 /*
616 * If the tick is due in the next period, keep it ticking or
617 * restart it proper.
618 */
619 delta = next_tick - basemono;
620 if (delta <= (u64)TICK_NSEC) {
621 tick.tv64 = 0;
615 if (!ts->tick_stopped) 622 if (!ts->tick_stopped)
616 goto out; 623 goto out;
617 if (delta_jiffies == 0) { 624 if (delta == 0) {
618 /* Tick is stopped, but required now. Enforce it */ 625 /* Tick is stopped, but required now. Enforce it */
619 tick_nohz_restart(ts, now); 626 tick_nohz_restart(ts, now);
620 goto out; 627 goto out;
@@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
629 * do_timer() never invoked. Keep track of the fact that it 636 * do_timer() never invoked. Keep track of the fact that it
630 * was the one which had the do_timer() duty last. If this cpu 637 * was the one which had the do_timer() duty last. If this cpu
631 * is the one which had the do_timer() duty last, we limit the 638 * is the one which had the do_timer() duty last, we limit the
632 * sleep time to the timekeeping max_deferement value which we 639 * sleep time to the timekeeping max_deferement value.
633 * retrieved above. Otherwise we can sleep as long as we want. 640 * Otherwise we can sleep as long as we want.
634 */ 641 */
642 delta = timekeeping_max_deferment();
635 if (cpu == tick_do_timer_cpu) { 643 if (cpu == tick_do_timer_cpu) {
636 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 644 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
637 ts->do_timer_last = 1; 645 ts->do_timer_last = 1;
638 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { 646 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
639 time_delta = KTIME_MAX; 647 delta = KTIME_MAX;
640 ts->do_timer_last = 0; 648 ts->do_timer_last = 0;
641 } else if (!ts->do_timer_last) { 649 } else if (!ts->do_timer_last) {
642 time_delta = KTIME_MAX; 650 delta = KTIME_MAX;
643 } 651 }
644 652
645#ifdef CONFIG_NO_HZ_FULL 653#ifdef CONFIG_NO_HZ_FULL
654 /* Limit the tick delta to the maximum scheduler deferment */
646 if (!ts->inidle) 655 if (!ts->inidle)
647 time_delta = min(time_delta, scheduler_tick_max_deferment()); 656 delta = min(delta, scheduler_tick_max_deferment());
648#endif 657#endif
649 658
650 /* 659 /* Calculate the next expiry time */
651 * calculate the expiry time for the next timer wheel 660 if (delta < (KTIME_MAX - basemono))
652 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 661 expires = basemono + delta;
653 * there is no timer pending or at least extremely far into
654 * the future (12 days for HZ=1000). In this case we set the
655 * expiry to the end of time.
656 */
657 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
658 /*
659 * Calculate the time delta for the next timer event.
660 * If the time delta exceeds the maximum time delta
661 * permitted by the current clocksource then adjust
662 * the time delta accordingly to ensure the
663 * clocksource does not wrap.
664 */
665 time_delta = min_t(u64, time_delta,
666 tick_period.tv64 * delta_jiffies);
667 }
668
669 if (time_delta < KTIME_MAX)
670 expires = ktime_add_ns(last_update, time_delta);
671 else 662 else
672 expires.tv64 = KTIME_MAX; 663 expires = KTIME_MAX;
664
665 expires = min_t(u64, expires, next_tick);
666 tick.tv64 = expires;
673 667
674 /* Skip reprogram of event if its not changed */ 668 /* Skip reprogram of event if its not changed */
675 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 669 if (ts->tick_stopped && (expires == dev->next_event.tv64))
676 goto out; 670 goto out;
677 671
678 ret = expires;
679
680 /* 672 /*
681 * nohz_stop_sched_tick can be called several times before 673 * nohz_stop_sched_tick can be called several times before
682 * the nohz_restart_sched_tick is called. This happens when 674 * the nohz_restart_sched_tick is called. This happens when
@@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
694 } 686 }
695 687
696 /* 688 /*
697 * If the expiration time == KTIME_MAX, then 689 * If the expiration time == KTIME_MAX, then we simply stop
698 * in this case we simply stop the tick timer. 690 * the tick timer.
699 */ 691 */
700 if (unlikely(expires.tv64 == KTIME_MAX)) { 692 if (unlikely(expires == KTIME_MAX)) {
701 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 693 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
702 hrtimer_cancel(&ts->sched_timer); 694 hrtimer_cancel(&ts->sched_timer);
703 goto out; 695 goto out;
704 } 696 }
705 697
706 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 698 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
707 hrtimer_start(&ts->sched_timer, expires, 699 hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
708 HRTIMER_MODE_ABS_PINNED);
709 else 700 else
710 tick_program_event(expires, 1); 701 tick_program_event(tick, 1);
711out: 702out:
712 ts->next_jiffies = next_jiffies; 703 /* Update the estimated sleep length */
713 ts->last_jiffies = last_jiffies;
714 ts->sleep_length = ktime_sub(dev->next_event, now); 704 ts->sleep_length = ktime_sub(dev->next_event, now);
715 705 return tick;
716 return ret;
717} 706}
718 707
719static void tick_nohz_full_stop_tick(struct tick_sched *ts) 708static void tick_nohz_full_stop_tick(struct tick_sched *ts)
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1a17..42fdf4958bcc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
57 ktime_t iowait_sleeptime; 57 ktime_t iowait_sleeptime;
58 ktime_t sleep_length; 58 ktime_t sleep_length;
59 unsigned long last_jiffies; 59 unsigned long last_jiffies;
60 unsigned long next_jiffies; 60 u64 next_timer;
61 ktime_t idle_expires; 61 ktime_t idle_expires;
62 int do_timer_last; 62 int do_timer_last;
63}; 63};
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index b31f13f4fe41..172b83cd2f8e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
49#include <asm/timex.h> 49#include <asm/timex.h>
50#include <asm/io.h> 50#include <asm/io.h>
51 51
52#include "tick-internal.h"
53
52#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
53#include <trace/events/timer.h> 55#include <trace/events/timer.h>
54 56
@@ -1311,54 +1313,48 @@ cascade:
1311 * Check, if the next hrtimer event is before the next timer wheel 1313 * Check, if the next hrtimer event is before the next timer wheel
1312 * event: 1314 * event:
1313 */ 1315 */
1314static unsigned long cmp_next_hrtimer_event(unsigned long now, 1316static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
1315 unsigned long expires)
1316{ 1317{
1317 ktime_t hr_delta = hrtimer_get_next_event(); 1318 u64 nextevt = hrtimer_get_next_event();
1318 struct timespec tsdelta;
1319 unsigned long delta;
1320
1321 if (hr_delta.tv64 == KTIME_MAX)
1322 return expires;
1323 1319
1324 /* 1320 /*
1325 * Expired timer available, let it expire in the next tick 1321 * If high resolution timers are enabled
1322 * hrtimer_get_next_event() returns KTIME_MAX.
1326 */ 1323 */
1327 if (hr_delta.tv64 <= 0) 1324 if (expires <= nextevt)
1328 return now + 1; 1325 return expires;
1329
1330 tsdelta = ktime_to_timespec(hr_delta);
1331 delta = timespec_to_jiffies(&tsdelta);
1332 1326
1333 /* 1327 /*
1334 * Limit the delta to the max value, which is checked in 1328 * If the next timer is already expired, return the tick base
1335 * tick_nohz_stop_sched_tick(): 1329 * time so the tick is fired immediately.
1336 */ 1330 */
1337 if (delta > NEXT_TIMER_MAX_DELTA) 1331 if (nextevt <= basem)
1338 delta = NEXT_TIMER_MAX_DELTA; 1332 return basem;
1339 1333
1340 /* 1334 /*
1341 * Take rounding errors in to account and make sure, that it 1335 * Round up to the next jiffie. High resolution timers are
1342 * expires in the next tick. Otherwise we go into an endless 1336 * off, so the hrtimers are expired in the tick and we need to
1343 * ping pong due to tick_nohz_stop_sched_tick() retriggering 1337 * make sure that this tick really expires the timer to avoid
1344 * the timer softirq 1338 * a ping pong of the nohz stop code.
1339 *
1340 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
1345 */ 1341 */
1346 if (delta < 1) 1342 return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
1347 delta = 1;
1348 now += delta;
1349 if (time_before(now, expires))
1350 return now;
1351 return expires;
1352} 1343}
1353 1344
1354/** 1345/**
1355 * get_next_timer_interrupt - return the jiffy of the next pending timer 1346 * get_next_timer_interrupt - return the time (clock mono) of the next timer
1356 * @now: current time (in jiffies) 1347 * @basej: base time jiffies
1348 * @basem: base time clock monotonic
1349 *
1350 * Returns the tick aligned clock monotonic time of the next pending
1351 * timer or KTIME_MAX if no timer is pending.
1357 */ 1352 */
1358unsigned long get_next_timer_interrupt(unsigned long now) 1353u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
1359{ 1354{
1360 struct tvec_base *base = __this_cpu_read(tvec_bases); 1355 struct tvec_base *base = __this_cpu_read(tvec_bases);
1361 unsigned long expires = now + NEXT_TIMER_MAX_DELTA; 1356 u64 expires = KTIME_MAX;
1357 unsigned long nextevt;
1362 1358
1363 /* 1359 /*
1364 * Pretend that there is no timer pending if the cpu is offline. 1360 * Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1371 if (base->active_timers) { 1367 if (base->active_timers) {
1372 if (time_before_eq(base->next_timer, base->timer_jiffies)) 1368 if (time_before_eq(base->next_timer, base->timer_jiffies))
1373 base->next_timer = __next_timer_interrupt(base); 1369 base->next_timer = __next_timer_interrupt(base);
1374 expires = base->next_timer; 1370 nextevt = base->next_timer;
1371 if (time_before_eq(nextevt, basej))
1372 expires = basem;
1373 else
1374 expires = basem + (nextevt - basej) * TICK_NSEC;
1375 } 1375 }
1376 spin_unlock(&base->lock); 1376 spin_unlock(&base->lock);
1377 1377
1378 if (time_before_eq(expires, now)) 1378 return cmp_next_hrtimer_event(basem, expires);
1379 return now;
1380
1381 return cmp_next_hrtimer_event(now, expires);
1382} 1379}
1383#endif 1380#endif
1384 1381
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 6232fc536185..66f39bba5353 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -191,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
191 P_ns(idle_sleeptime); 191 P_ns(idle_sleeptime);
192 P_ns(iowait_sleeptime); 192 P_ns(iowait_sleeptime);
193 P(last_jiffies); 193 P(last_jiffies);
194 P(next_jiffies); 194 P(next_timer);
195 P_ns(idle_expires); 195 P_ns(idle_expires);
196 SEQ_printf(m, "jiffies: %Lu\n", 196 SEQ_printf(m, "jiffies: %Lu\n",
197 (unsigned long long)jiffies); 197 (unsigned long long)jiffies);
@@ -289,7 +289,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
289 289
290static inline void timer_list_header(struct seq_file *m, u64 now) 290static inline void timer_list_header(struct seq_file *m, u64 now)
291{ 291{
292 SEQ_printf(m, "Timer List Version: v0.7\n"); 292 SEQ_printf(m, "Timer List Version: v0.8\n");
293 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 293 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
294 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 294 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
295 SEQ_printf(m, "\n"); 295 SEQ_printf(m, "\n");