author    Jon Hunter <jon-hunter@ti.com>  2009-08-18 13:45:10 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2009-11-13 14:46:24 -0500
commit    98962465ed9e6ea99c38e0af63fe1dcb5a79dc25 (patch)
tree      f3f69ad8f6cd47e72a75da6de49eb3402f15cd9b
parent    529eaccd900a59724619b4a6ef6579fd518d5218 (diff)
nohz: Prevent clocksource wrapping during idle
The dynamic tick allows the kernel to sleep for periods longer than a
single tick, but it does not limit the sleep time currently. In the
worst case the kernel could sleep longer than the wrap around time of
the time keeping clock source which would result in losing track of
time.

Prevent this by limiting it to the safe maximum sleep time of the
current time keeping clock source. The value is calculated when the
clock source is registered.

[ tglx: simplified the code a bit and massaged the commit msg ]

Signed-off-by: Jon Hunter <jon-hunter@ti.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <1250617512-23567-2-git-send-email-jon-hunter@ti.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
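To make the failure mode concrete: suppose a clocksource is a 32-bit counter ticking at 24 MHz (hypothetical numbers, not taken from this patch). The counter wraps after 2^32 / 24000000 ~= 179 seconds, so an idle period longer than that is silently aliased to a shorter one and wall time drifts. A minimal userspace sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mask = 0xffffffffULL;  /* 32-bit counter, hypothetical */
        uint64_t freq = 24000000ULL;    /* 24 MHz, hypothetical */

        /* The counter wraps after mask + 1 cycles. */
        double wrap_secs = (double)(mask + 1) / (double)freq;

        printf("counter wraps after %.1f s\n", wrap_secs);  /* ~179.0 s */
        printf("any idle sleep must end well before that\n");
        return 0;
}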
-rw-r--r--  include/linux/clocksource.h |  2
-rw-r--r--  include/linux/time.h        |  1
-rw-r--r--  kernel/time/clocksource.c   | 44
-rw-r--r--  kernel/time/tick-sched.c    | 52
-rw-r--r--  kernel/time/timekeeping.c   | 11
5 files changed, 96 insertions(+), 14 deletions(-)
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f57f88250526..279c5478e8a6 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -151,6 +151,7 @@ extern u64 timecounter_cyc2time(struct timecounter *tc,
  *                      subtraction of non 64 bit counters
  * @mult:               cycle to nanosecond multiplier
  * @shift:              cycle to nanosecond divisor (power of two)
+ * @max_idle_ns:        max idle time permitted by the clocksource (nsecs)
  * @flags:              flags describing special properties
  * @vread:              vsyscall based read
  * @resume:             resume function for the clocksource, if necessary
@@ -168,6 +169,7 @@ struct clocksource {
         cycle_t mask;
         u32 mult;
         u32 shift;
+        u64 max_idle_ns;
         unsigned long flags;
         cycle_t (*vread)(void);
         void (*resume)(void);
diff --git a/include/linux/time.h b/include/linux/time.h
index fe04e5ef6a59..6e026e45a179 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -148,6 +148,7 @@ extern void monotonic_to_bootbased(struct timespec *ts);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_valid_for_hres(void);
+extern u64 timekeeping_max_deferment(void);
 extern void update_wall_time(void);
 extern void update_xtime_cache(u64 nsec);
 extern void timekeeping_leap_insert(int leapsecond);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 407c0894ef37..b65b242f04dd 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -469,6 +469,47 @@ void clocksource_touch_watchdog(void)
 #ifdef CONFIG_GENERIC_TIME
 
 /**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs:         Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+        u64 max_nsecs, max_cycles;
+
+        /*
+         * Calculate the maximum number of cycles that we can pass to the
+         * cyc2ns function without overflowing a 64-bit signed result. The
+         * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
+         * is equivalent to the below.
+         * max_cycles < (2^63)/cs->mult
+         * max_cycles < 2^(log2((2^63)/cs->mult))
+         * max_cycles < 2^(log2(2^63) - log2(cs->mult))
+         * max_cycles < 2^(63 - log2(cs->mult))
+         * max_cycles < 1 << (63 - log2(cs->mult))
+         * Please note that we add 1 to the result of the log2 to account for
+         * any rounding errors, ensure the above inequality is satisfied and
+         * no overflow will occur.
+         */
+        max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+
+        /*
+         * The actual maximum number of cycles we can defer the clocksource is
+         * determined by the minimum of max_cycles and cs->mask.
+         */
+        max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
+        max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+
+        /*
+         * To ensure that the clocksource does not wrap whilst we are idle,
+         * limit the time the clocksource can be deferred by 12.5%. Please
+         * note a margin of 12.5% is used because this can be computed with
+         * a shift, versus say 10% which would require division.
+         */
+        return max_nsecs - (max_nsecs >> 5);
+}
+
+/**
  * clocksource_select - Select the best clocksource available
  *
  * Private function. Must hold clocksource_mutex when called.
@@ -564,6 +605,9 @@ static void clocksource_enqueue(struct clocksource *cs)
  */
 int clocksource_register(struct clocksource *cs)
 {
+        /* calculate max idle time permitted for this clocksource */
+        cs->max_idle_ns = clocksource_max_deferment(cs);
+
         mutex_lock(&clocksource_mutex);
         clocksource_enqueue(cs);
         clocksource_select();
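The deferment calculation above is easy to sanity-check outside the kernel. One detail worth noting: max_nsecs >> 5 subtracts 1/32, about 3.1%, although the comment speaks of 12.5% (which would be >> 3). The sketch below re-implements the computation in userspace under hypothetical parameters (a 32-bit counter with mult = 10667 and shift = 8, roughly a 24 MHz source); ilog2() and min_t() are replaced by portable stand-ins:

#include <stdint.h>
#include <stdio.h>

/* floor(log2(v)) for v != 0, standing in for the kernel's ilog2() */
static int ilog2_u64(uint64_t v)
{
        return 63 - __builtin_clzll(v);
}

/* same form as clocksource_cyc2ns(): (cycles * mult) >> shift */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        uint32_t mult = 10667, shift = 8;       /* hypothetical ~24 MHz source */
        uint64_t mask = 0xffffffffULL;          /* 32-bit counter */

        /* Bound cycles so cycles * mult stays below 2^63. */
        uint64_t max_cycles = 1ULL << (63 - (ilog2_u64(mult) + 1));

        /* The counter also wraps at mask; take the smaller limit. */
        if (max_cycles > mask)
                max_cycles = mask;

        uint64_t max_nsecs = cyc2ns(max_cycles, mult, shift);

        /* Apply the 1/32 safety margin from the patch. */
        uint64_t max_idle_ns = max_nsecs - (max_nsecs >> 5);

        printf("max_idle_ns = %llu (~%.1f s)\n",
               (unsigned long long)max_idle_ns, max_idle_ns / 1e9);
        return 0;
}

With these numbers the counter wraps after ~179 s and max_idle_ns comes out at roughly 173 s, i.e. the kernel will never program an idle sleep long enough for the hypothetical counter to wrap.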
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c65ba0faa98f..a80b4644fe6b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -208,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
         struct tick_sched *ts;
         ktime_t last_update, expires, now;
         struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
+        u64 time_delta;
         int cpu;
 
         local_irq_save(flags);
@@ -262,6 +263,17 @@ void tick_nohz_stop_sched_tick(int inidle)
                 seq = read_seqbegin(&xtime_lock);
                 last_update = last_jiffies_update;
                 last_jiffies = jiffies;
+
+                /*
+                 * On SMP we really should only care for the CPU which
+                 * has the do_timer duty assigned. All other CPUs can
+                 * sleep as long as they want.
+                 */
+                if (cpu == tick_do_timer_cpu ||
+                    tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+                        time_delta = timekeeping_max_deferment();
+                else
+                        time_delta = KTIME_MAX;
         } while (read_seqretry(&xtime_lock, seq));
 
         if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
@@ -284,11 +296,26 @@ void tick_nohz_stop_sched_tick(int inidle)
         if ((long)delta_jiffies >= 1) {
 
                 /*
-                 * calculate the expiry time for the next timer wheel
-                 * timer
-                 */
-                expires = ktime_add_ns(last_update, tick_period.tv64 *
-                                   delta_jiffies);
+                 * calculate the expiry time for the next timer wheel
+                 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
+                 * that there is no timer pending or at least extremely
+                 * far into the future (12 days for HZ=1000). In this
+                 * case we set the expiry to the end of time.
+                 */
+                if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
+                        /*
+                         * Calculate the time delta for the next timer event.
+                         * If the time delta exceeds the maximum time delta
+                         * permitted by the current clocksource then adjust
+                         * the time delta accordingly to ensure the
+                         * clocksource does not wrap.
+                         */
+                        time_delta = min_t(u64, time_delta,
+                                           tick_period.tv64 * delta_jiffies);
+                        expires = ktime_add_ns(last_update, time_delta);
+                } else {
+                        expires.tv64 = KTIME_MAX;
+                }
 
                 /*
                  * If this cpu is the one which updates jiffies, then
@@ -332,22 +359,19 @@ void tick_nohz_stop_sched_tick(int inidle)
 
                 ts->idle_sleeps++;
 
+                /* Mark expires */
+                ts->idle_expires = expires;
+
                 /*
-                 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
-                 * there is no timer pending or at least extremly far
-                 * into the future (12 days for HZ=1000). In this case
-                 * we simply stop the tick timer:
+                 * If the expiration time == KTIME_MAX, then
+                 * in this case we simply stop the tick timer.
                  */
-                if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) {
-                        ts->idle_expires.tv64 = KTIME_MAX;
+                if (unlikely(expires.tv64 == KTIME_MAX)) {
                         if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
                                 hrtimer_cancel(&ts->sched_timer);
                         goto out;
                 }
 
-                /* Mark expiries */
-                ts->idle_expires = expires;
-
                 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
                         hrtimer_start(&ts->sched_timer, expires,
                                       HRTIMER_MODE_ABS_PINNED);
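The net effect of the tick-sched changes is a simple clamp: the requested sleep (tick_period * delta_jiffies) is bounded by the clocksource limit before it becomes the hrtimer expiry. A standalone sketch of that clamp, with illustrative names and values (not code from the patch):

#include <stdint.h>
#include <stdio.h>

/*
 * How long may this CPU sleep? want_ns is the span until the next
 * timer wheel event; limit_ns is the clocksource's max deferment
 * (effectively unbounded on CPUs without the do_timer duty).
 */
static uint64_t clamp_sleep_ns(uint64_t want_ns, uint64_t limit_ns)
{
        return want_ns < limit_ns ? want_ns : limit_ns;
}

int main(void)
{
        uint64_t tick_ns = 10000000ULL;         /* HZ=100 -> 10 ms tick */
        uint64_t delta_jiffies = 1000000ULL;    /* next timer far away */
        uint64_t limit_ns = 173000000000ULL;    /* ~173 s, hypothetical */

        uint64_t sleep = clamp_sleep_ns(tick_ns * delta_jiffies, limit_ns);

        printf("sleep capped at %.1f s\n", sleep / 1e9);  /* 173.0 s */
        return 0;
}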
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 96b3f0dfa5dc..5d4d4239a0aa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -478,6 +478,17 @@ int timekeeping_valid_for_hres(void)
 }
 
 /**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+        return timekeeper.clock->max_idle_ns;
+}
+
+/**
  * read_persistent_clock - Return time from the persistent clock.
  *
  * Weak dummy function for arches that do not yet support it.
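For completeness, the locking rule in the timekeeping_max_deferment() kernel-doc is exactly the pattern the tick-sched.c hunk follows; a caller would look roughly like this fragment (kernel context assumed, not a complete function):

        u64 time_delta;
        unsigned long seq;

        do {
                seq = read_seqbegin(&xtime_lock);
                /* the clocksource cannot change within a stable seq window */
                time_delta = timekeeping_max_deferment();
        } while (read_seqretry(&xtime_lock, seq));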