diff options
author | John Stultz <john.stultz@linaro.org> | 2012-07-17 13:33:48 -0400 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2012-07-19 11:58:22 -0400 |
commit | 9c24771f844b6f0708a72cd116953e0a128e5d2a (patch) | |
tree | bdbee9c35ca47d6b023a9d86f0e15cb508fe7508 /kernel | |
parent | 31b83ef7cfda5a7b74446ca70c1e231b24450cbd (diff) |
ntp: Fix leap-second hrtimer livelock
This is a backport of 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d
This should have been backported when it was committed, but I
mistook the problem as requiring the ntp_lock changes
that landed in 3.4 in order for it to occur.
Unfortunately the same issue can happen (with only one cpu)
as follows:
do_adjtimex()
write_seqlock_irq(&xtime_lock);
process_adjtimex_modes()
process_adj_status()
ntp_start_leap_timer()
hrtimer_start()
hrtimer_reprogram()
tick_program_event()
clockevents_program_event()
ktime_get()
seq = read_seqbegin(&xtime_lock); [DEADLOCK]
This deadlock will not always occur, as it requires the
leap_timer to force a hrtimer_reprogram which only happens
if it's set and there's no sooner timer to expire.
NOTE: This patch, being faithful to the original commit,
introduces a bug (we don't update wall_to_monotonic),
which will be resolved by backporting a following fix.
Original commit message below:
Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp
subsystem has used an hrtimer for triggering the leapsecond
adjustment. However, this can cause a potential livelock.
Thomas diagnosed this as the following pattern:
CPU 0 CPU 1
do_adjtimex()
spin_lock_irq(&ntp_lock);
process_adjtimex_modes(); timer_interrupt()
process_adj_status(); do_timer()
ntp_start_leap_timer(); write_lock(&xtime_lock);
hrtimer_start(); update_wall_time();
hrtimer_reprogram(); ntp_tick_length()
tick_program_event() spin_lock(&ntp_lock);
clockevents_program_event()
ktime_get()
seq = read_seqbegin(&xtime_lock);
This patch tries to avoid the problem by reverting back to not using
an hrtimer to inject leapseconds, and instead we handle the leapsecond
processing in the second_overflow() function.
The downside to this change is that on systems that support highres
timers, the leap second processing will occur on a HZ tick boundary,
(ie: ~1-10ms, depending on HZ) after the leap second instead of
possibly sooner (~34us in my tests w/ x86_64 lapic).
This patch applies on top of tip/timers/core.
CC: Sasha Levin <levinsasha928@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Diagnosed-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/time/ntp.c | 122 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 18 |
2 files changed, 47 insertions, 93 deletions
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4b85a7a7252..4508f7f68a7 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -31,8 +31,6 @@ unsigned long tick_nsec; | |||
31 | u64 tick_length; | 31 | u64 tick_length; |
32 | static u64 tick_length_base; | 32 | static u64 tick_length_base; |
33 | 33 | ||
34 | static struct hrtimer leap_timer; | ||
35 | |||
36 | #define MAX_TICKADJ 500LL /* usecs */ | 34 | #define MAX_TICKADJ 500LL /* usecs */ |
37 | #define MAX_TICKADJ_SCALED \ | 35 | #define MAX_TICKADJ_SCALED \ |
38 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) | 36 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
@@ -350,60 +348,60 @@ void ntp_clear(void) | |||
350 | } | 348 | } |
351 | 349 | ||
352 | /* | 350 | /* |
353 | * Leap second processing. If in leap-insert state at the end of the | 351 | * this routine handles the overflow of the microsecond field |
354 | * day, the system clock is set back one second; if in leap-delete | 352 | * |
355 | * state, the system clock is set ahead one second. | 353 | * The tricky bits of code to handle the accurate clock support |
354 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
355 | * They were originally developed for SUN and DEC kernels. | ||
356 | * All the kudos should go to Dave for this stuff. | ||
357 | * | ||
358 | * Also handles leap second processing, and returns leap offset | ||
356 | */ | 359 | */ |
357 | static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) | 360 | int second_overflow(unsigned long secs) |
358 | { | 361 | { |
359 | enum hrtimer_restart res = HRTIMER_NORESTART; | 362 | int leap = 0; |
360 | 363 | s64 delta; | |
361 | write_seqlock(&xtime_lock); | ||
362 | 364 | ||
365 | /* | ||
366 | * Leap second processing. If in leap-insert state at the end of the | ||
367 | * day, the system clock is set back one second; if in leap-delete | ||
368 | * state, the system clock is set ahead one second. | ||
369 | */ | ||
363 | switch (time_state) { | 370 | switch (time_state) { |
364 | case TIME_OK: | 371 | case TIME_OK: |
372 | if (time_status & STA_INS) | ||
373 | time_state = TIME_INS; | ||
374 | else if (time_status & STA_DEL) | ||
375 | time_state = TIME_DEL; | ||
365 | break; | 376 | break; |
366 | case TIME_INS: | 377 | case TIME_INS: |
367 | timekeeping_leap_insert(-1); | 378 | if (secs % 86400 == 0) { |
368 | time_state = TIME_OOP; | 379 | leap = -1; |
369 | printk(KERN_NOTICE | 380 | time_state = TIME_OOP; |
370 | "Clock: inserting leap second 23:59:60 UTC\n"); | 381 | printk(KERN_NOTICE |
371 | hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); | 382 | "Clock: inserting leap second 23:59:60 UTC\n"); |
372 | res = HRTIMER_RESTART; | 383 | } |
373 | break; | 384 | break; |
374 | case TIME_DEL: | 385 | case TIME_DEL: |
375 | timekeeping_leap_insert(1); | 386 | if ((secs + 1) % 86400 == 0) { |
376 | time_tai--; | 387 | leap = 1; |
377 | time_state = TIME_WAIT; | 388 | time_tai--; |
378 | printk(KERN_NOTICE | 389 | time_state = TIME_WAIT; |
379 | "Clock: deleting leap second 23:59:59 UTC\n"); | 390 | printk(KERN_NOTICE |
391 | "Clock: deleting leap second 23:59:59 UTC\n"); | ||
392 | } | ||
380 | break; | 393 | break; |
381 | case TIME_OOP: | 394 | case TIME_OOP: |
382 | time_tai++; | 395 | time_tai++; |
383 | time_state = TIME_WAIT; | 396 | time_state = TIME_WAIT; |
384 | /* fall through */ | 397 | break; |
398 | |||
385 | case TIME_WAIT: | 399 | case TIME_WAIT: |
386 | if (!(time_status & (STA_INS | STA_DEL))) | 400 | if (!(time_status & (STA_INS | STA_DEL))) |
387 | time_state = TIME_OK; | 401 | time_state = TIME_OK; |
388 | break; | 402 | break; |
389 | } | 403 | } |
390 | 404 | ||
391 | write_sequnlock(&xtime_lock); | ||
392 | |||
393 | return res; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * this routine handles the overflow of the microsecond field | ||
398 | * | ||
399 | * The tricky bits of code to handle the accurate clock support | ||
400 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
401 | * They were originally developed for SUN and DEC kernels. | ||
402 | * All the kudos should go to Dave for this stuff. | ||
403 | */ | ||
404 | void second_overflow(void) | ||
405 | { | ||
406 | s64 delta; | ||
407 | 405 | ||
408 | /* Bump the maxerror field */ | 406 | /* Bump the maxerror field */ |
409 | time_maxerror += MAXFREQ / NSEC_PER_USEC; | 407 | time_maxerror += MAXFREQ / NSEC_PER_USEC; |
@@ -423,23 +421,25 @@ void second_overflow(void) | |||
423 | pps_dec_valid(); | 421 | pps_dec_valid(); |
424 | 422 | ||
425 | if (!time_adjust) | 423 | if (!time_adjust) |
426 | return; | 424 | goto out; |
427 | 425 | ||
428 | if (time_adjust > MAX_TICKADJ) { | 426 | if (time_adjust > MAX_TICKADJ) { |
429 | time_adjust -= MAX_TICKADJ; | 427 | time_adjust -= MAX_TICKADJ; |
430 | tick_length += MAX_TICKADJ_SCALED; | 428 | tick_length += MAX_TICKADJ_SCALED; |
431 | return; | 429 | goto out; |
432 | } | 430 | } |
433 | 431 | ||
434 | if (time_adjust < -MAX_TICKADJ) { | 432 | if (time_adjust < -MAX_TICKADJ) { |
435 | time_adjust += MAX_TICKADJ; | 433 | time_adjust += MAX_TICKADJ; |
436 | tick_length -= MAX_TICKADJ_SCALED; | 434 | tick_length -= MAX_TICKADJ_SCALED; |
437 | return; | 435 | goto out; |
438 | } | 436 | } |
439 | 437 | ||
440 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) | 438 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) |
441 | << NTP_SCALE_SHIFT; | 439 | << NTP_SCALE_SHIFT; |
442 | time_adjust = 0; | 440 | time_adjust = 0; |
441 | out: | ||
442 | return leap; | ||
443 | } | 443 | } |
444 | 444 | ||
445 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 445 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
@@ -501,27 +501,6 @@ static void notify_cmos_timer(void) | |||
501 | static inline void notify_cmos_timer(void) { } | 501 | static inline void notify_cmos_timer(void) { } |
502 | #endif | 502 | #endif |
503 | 503 | ||
504 | /* | ||
505 | * Start the leap seconds timer: | ||
506 | */ | ||
507 | static inline void ntp_start_leap_timer(struct timespec *ts) | ||
508 | { | ||
509 | long now = ts->tv_sec; | ||
510 | |||
511 | if (time_status & STA_INS) { | ||
512 | time_state = TIME_INS; | ||
513 | now += 86400 - now % 86400; | ||
514 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
515 | |||
516 | return; | ||
517 | } | ||
518 | |||
519 | if (time_status & STA_DEL) { | ||
520 | time_state = TIME_DEL; | ||
521 | now += 86400 - (now + 1) % 86400; | ||
522 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
523 | } | ||
524 | } | ||
525 | 504 | ||
526 | /* | 505 | /* |
527 | * Propagate a new txc->status value into the NTP state: | 506 | * Propagate a new txc->status value into the NTP state: |
@@ -546,22 +525,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
546 | time_status &= STA_RONLY; | 525 | time_status &= STA_RONLY; |
547 | time_status |= txc->status & ~STA_RONLY; | 526 | time_status |= txc->status & ~STA_RONLY; |
548 | 527 | ||
549 | switch (time_state) { | ||
550 | case TIME_OK: | ||
551 | ntp_start_leap_timer(ts); | ||
552 | break; | ||
553 | case TIME_INS: | ||
554 | case TIME_DEL: | ||
555 | time_state = TIME_OK; | ||
556 | ntp_start_leap_timer(ts); | ||
557 | case TIME_WAIT: | ||
558 | if (!(time_status & (STA_INS | STA_DEL))) | ||
559 | time_state = TIME_OK; | ||
560 | break; | ||
561 | case TIME_OOP: | ||
562 | hrtimer_restart(&leap_timer); | ||
563 | break; | ||
564 | } | ||
565 | } | 528 | } |
566 | /* | 529 | /* |
567 | * Called with the xtime lock held, so we can access and modify | 530 | * Called with the xtime lock held, so we can access and modify |
@@ -643,9 +606,6 @@ int do_adjtimex(struct timex *txc) | |||
643 | (txc->tick < 900000/USER_HZ || | 606 | (txc->tick < 900000/USER_HZ || |
644 | txc->tick > 1100000/USER_HZ)) | 607 | txc->tick > 1100000/USER_HZ)) |
645 | return -EINVAL; | 608 | return -EINVAL; |
646 | |||
647 | if (txc->modes & ADJ_STATUS && time_state != TIME_OK) | ||
648 | hrtimer_cancel(&leap_timer); | ||
649 | } | 609 | } |
650 | 610 | ||
651 | if (txc->modes & ADJ_SETOFFSET) { | 611 | if (txc->modes & ADJ_SETOFFSET) { |
@@ -967,6 +927,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); | |||
967 | void __init ntp_init(void) | 927 | void __init ntp_init(void) |
968 | { | 928 | { |
969 | ntp_clear(); | 929 | ntp_clear(); |
970 | hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | ||
971 | leap_timer.function = ntp_leap_second; | ||
972 | } | 930 | } |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5f458310668..c444da085a2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -169,15 +169,6 @@ static struct timespec raw_time; | |||
169 | /* flag for if timekeeping is suspended */ | 169 | /* flag for if timekeeping is suspended */ |
170 | int __read_mostly timekeeping_suspended; | 170 | int __read_mostly timekeeping_suspended; |
171 | 171 | ||
172 | /* must hold xtime_lock */ | ||
173 | void timekeeping_leap_insert(int leapsecond) | ||
174 | { | ||
175 | xtime.tv_sec += leapsecond; | ||
176 | wall_to_monotonic.tv_sec -= leapsecond; | ||
177 | update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, | ||
178 | timekeeper.mult); | ||
179 | } | ||
180 | |||
181 | /** | 172 | /** |
182 | * timekeeping_forward_now - update clock to the current time | 173 | * timekeeping_forward_now - update clock to the current time |
183 | * | 174 | * |
@@ -828,9 +819,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
828 | 819 | ||
829 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | 820 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; |
830 | while (timekeeper.xtime_nsec >= nsecps) { | 821 | while (timekeeper.xtime_nsec >= nsecps) { |
822 | int leap; | ||
831 | timekeeper.xtime_nsec -= nsecps; | 823 | timekeeper.xtime_nsec -= nsecps; |
832 | xtime.tv_sec++; | 824 | xtime.tv_sec++; |
833 | second_overflow(); | 825 | leap = second_overflow(xtime.tv_sec); |
826 | xtime.tv_sec += leap; | ||
834 | } | 827 | } |
835 | 828 | ||
836 | /* Accumulate raw time */ | 829 | /* Accumulate raw time */ |
@@ -936,9 +929,12 @@ static void update_wall_time(void) | |||
936 | * xtime.tv_nsec isn't larger then NSEC_PER_SEC | 929 | * xtime.tv_nsec isn't larger then NSEC_PER_SEC |
937 | */ | 930 | */ |
938 | if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { | 931 | if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { |
932 | int leap; | ||
939 | xtime.tv_nsec -= NSEC_PER_SEC; | 933 | xtime.tv_nsec -= NSEC_PER_SEC; |
940 | xtime.tv_sec++; | 934 | xtime.tv_sec++; |
941 | second_overflow(); | 935 | leap = second_overflow(xtime.tv_sec); |
936 | xtime.tv_sec += leap; | ||
937 | |||
942 | } | 938 | } |
943 | 939 | ||
944 | /* check to see if there is a new clocksource to use */ | 940 | /* check to see if there is a new clocksource to use */ |