diff options
| author | John Stultz <john.stultz@linaro.org> | 2012-03-15 16:04:03 -0400 |
|---|---|---|
| committer | John Stultz <john.stultz@linaro.org> | 2012-03-22 22:43:43 -0400 |
| commit | 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d (patch) | |
| tree | 007df06a9cf0d4d2b72ed7dd8d646e853de80e9b /kernel | |
| parent | 57779dc2b3b75bee05ef5d1ada47f615f7a13932 (diff) | |
ntp: Fix leap-second hrtimer livelock
Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp
subsystem has used an hrtimer for triggering the leap second
adjustment. However, this can cause a livelock.
Thomas diagnosed this as the following pattern:
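```
CPU 0                                   CPU 1
do_adjtimex()
  spin_lock_irq(&ntp_lock);
    process_adjtimex_modes();           timer_interrupt()
      process_adj_status();               do_timer()
        ntp_start_leap_timer();             write_lock(&xtime_lock);
          hrtimer_start();                    update_wall_time();
            hrtimer_reprogram();                ntp_tick_length()
              tick_program_event()                spin_lock(&ntp_lock);
                clockevents_program_event()
                  ktime_get()
                    seq = read_seqbegin(&xtime_lock);
```
That is, CPU 0 holds ntp_lock and waits in ktime_get() for the
xtime_lock writer to finish, while CPU 1 holds xtime_lock for writing
and waits for ntp_lock, so neither CPU can make progress.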
This patch avoids the problem by going back to not using an hrtimer
to inject leap seconds; instead, the leap second processing is handled
in the second_overflow() function.
The downside to this change is that on systems that support highres
timers, the leap second processing will occur on a HZ tick boundary
(i.e. ~1-10ms, depending on HZ) after the leap second, instead of
possibly sooner (~34us in my tests w/ x86_64 lapic).
This patch applies on top of tip/timers/core.
CC: Sasha Levin <levinsasha928@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Diagnosed-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/time/ntp.c | 128 | ||||
| -rw-r--r-- | kernel/time/timekeeping.c | 20 |
2 files changed, 47 insertions, 101 deletions
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6e039b144daf..3d17ebd47fa2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -34,8 +34,6 @@ unsigned long tick_nsec; | |||
| 34 | static u64 tick_length; | 34 | static u64 tick_length; |
| 35 | static u64 tick_length_base; | 35 | static u64 tick_length_base; |
| 36 | 36 | ||
| 37 | static struct hrtimer leap_timer; | ||
| 38 | |||
| 39 | #define MAX_TICKADJ 500LL /* usecs */ | 37 | #define MAX_TICKADJ 500LL /* usecs */ |
| 40 | #define MAX_TICKADJ_SCALED \ | 38 | #define MAX_TICKADJ_SCALED \ |
| 41 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) | 39 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
| @@ -381,70 +379,63 @@ u64 ntp_tick_length(void) | |||
| 381 | 379 | ||
| 382 | 380 | ||
| 383 | /* | 381 | /* |
| 384 | * Leap second processing. If in leap-insert state at the end of the | 382 | * this routine handles the overflow of the microsecond field |
| 385 | * day, the system clock is set back one second; if in leap-delete | 383 | * |
| 386 | * state, the system clock is set ahead one second. | 384 | * The tricky bits of code to handle the accurate clock support |
| 385 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
| 386 | * They were originally developed for SUN and DEC kernels. | ||
| 387 | * All the kudos should go to Dave for this stuff. | ||
| 388 | * | ||
| 389 | * Also handles leap second processing, and returns leap offset | ||
| 387 | */ | 390 | */ |
| 388 | static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) | 391 | int second_overflow(unsigned long secs) |
| 389 | { | 392 | { |
| 390 | enum hrtimer_restart res = HRTIMER_NORESTART; | 393 | s64 delta; |
| 391 | unsigned long flags; | ||
| 392 | int leap = 0; | 394 | int leap = 0; |
| 395 | unsigned long flags; | ||
| 393 | 396 | ||
| 394 | spin_lock_irqsave(&ntp_lock, flags); | 397 | spin_lock_irqsave(&ntp_lock, flags); |
| 398 | |||
| 399 | /* | ||
| 400 | * Leap second processing. If in leap-insert state at the end of the | ||
| 401 | * day, the system clock is set back one second; if in leap-delete | ||
| 402 | * state, the system clock is set ahead one second. | ||
| 403 | */ | ||
| 395 | switch (time_state) { | 404 | switch (time_state) { |
| 396 | case TIME_OK: | 405 | case TIME_OK: |
| 406 | if (time_status & STA_INS) | ||
| 407 | time_state = TIME_INS; | ||
| 408 | else if (time_status & STA_DEL) | ||
| 409 | time_state = TIME_DEL; | ||
| 397 | break; | 410 | break; |
| 398 | case TIME_INS: | 411 | case TIME_INS: |
| 399 | leap = -1; | 412 | if (secs % 86400 == 0) { |
| 400 | time_state = TIME_OOP; | 413 | leap = -1; |
| 401 | printk(KERN_NOTICE | 414 | time_state = TIME_OOP; |
| 402 | "Clock: inserting leap second 23:59:60 UTC\n"); | 415 | printk(KERN_NOTICE |
| 403 | hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); | 416 | "Clock: inserting leap second 23:59:60 UTC\n"); |
| 404 | res = HRTIMER_RESTART; | 417 | } |
| 405 | break; | 418 | break; |
| 406 | case TIME_DEL: | 419 | case TIME_DEL: |
| 407 | leap = 1; | 420 | if ((secs + 1) % 86400 == 0) { |
| 408 | time_tai--; | 421 | leap = 1; |
| 409 | time_state = TIME_WAIT; | 422 | time_tai--; |
| 410 | printk(KERN_NOTICE | 423 | time_state = TIME_WAIT; |
| 411 | "Clock: deleting leap second 23:59:59 UTC\n"); | 424 | printk(KERN_NOTICE |
| 425 | "Clock: deleting leap second 23:59:59 UTC\n"); | ||
| 426 | } | ||
| 412 | break; | 427 | break; |
| 413 | case TIME_OOP: | 428 | case TIME_OOP: |
| 414 | time_tai++; | 429 | time_tai++; |
| 415 | time_state = TIME_WAIT; | 430 | time_state = TIME_WAIT; |
| 416 | /* fall through */ | 431 | break; |
| 432 | |||
| 417 | case TIME_WAIT: | 433 | case TIME_WAIT: |
| 418 | if (!(time_status & (STA_INS | STA_DEL))) | 434 | if (!(time_status & (STA_INS | STA_DEL))) |
| 419 | time_state = TIME_OK; | 435 | time_state = TIME_OK; |
| 420 | break; | 436 | break; |
| 421 | } | 437 | } |
| 422 | spin_unlock_irqrestore(&ntp_lock, flags); | ||
| 423 | |||
| 424 | /* | ||
| 425 | * We have to call this outside of the ntp_lock to keep | ||
| 426 | * the proper locking hierarchy | ||
| 427 | */ | ||
| 428 | if (leap) | ||
| 429 | timekeeping_leap_insert(leap); | ||
| 430 | |||
| 431 | return res; | ||
| 432 | } | ||
| 433 | |||
| 434 | /* | ||
| 435 | * this routine handles the overflow of the microsecond field | ||
| 436 | * | ||
| 437 | * The tricky bits of code to handle the accurate clock support | ||
| 438 | * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. | ||
| 439 | * They were originally developed for SUN and DEC kernels. | ||
| 440 | * All the kudos should go to Dave for this stuff. | ||
| 441 | */ | ||
| 442 | void second_overflow(void) | ||
| 443 | { | ||
| 444 | s64 delta; | ||
| 445 | unsigned long flags; | ||
| 446 | 438 | ||
| 447 | spin_lock_irqsave(&ntp_lock, flags); | ||
| 448 | 439 | ||
| 449 | /* Bump the maxerror field */ | 440 | /* Bump the maxerror field */ |
| 450 | time_maxerror += MAXFREQ / NSEC_PER_USEC; | 441 | time_maxerror += MAXFREQ / NSEC_PER_USEC; |
| @@ -481,8 +472,13 @@ void second_overflow(void) | |||
| 481 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) | 472 | tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) |
| 482 | << NTP_SCALE_SHIFT; | 473 | << NTP_SCALE_SHIFT; |
| 483 | time_adjust = 0; | 474 | time_adjust = 0; |
| 475 | |||
| 476 | |||
| 477 | |||
| 484 | out: | 478 | out: |
| 485 | spin_unlock_irqrestore(&ntp_lock, flags); | 479 | spin_unlock_irqrestore(&ntp_lock, flags); |
| 480 | |||
| 481 | return leap; | ||
| 486 | } | 482 | } |
| 487 | 483 | ||
| 488 | #ifdef CONFIG_GENERIC_CMOS_UPDATE | 484 | #ifdef CONFIG_GENERIC_CMOS_UPDATE |
| @@ -544,27 +540,6 @@ static void notify_cmos_timer(void) | |||
| 544 | static inline void notify_cmos_timer(void) { } | 540 | static inline void notify_cmos_timer(void) { } |
| 545 | #endif | 541 | #endif |
| 546 | 542 | ||
| 547 | /* | ||
| 548 | * Start the leap seconds timer: | ||
| 549 | */ | ||
| 550 | static inline void ntp_start_leap_timer(struct timespec *ts) | ||
| 551 | { | ||
| 552 | long now = ts->tv_sec; | ||
| 553 | |||
| 554 | if (time_status & STA_INS) { | ||
| 555 | time_state = TIME_INS; | ||
| 556 | now += 86400 - now % 86400; | ||
| 557 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
| 558 | |||
| 559 | return; | ||
| 560 | } | ||
| 561 | |||
| 562 | if (time_status & STA_DEL) { | ||
| 563 | time_state = TIME_DEL; | ||
| 564 | now += 86400 - (now + 1) % 86400; | ||
| 565 | hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); | ||
| 566 | } | ||
| 567 | } | ||
| 568 | 543 | ||
| 569 | /* | 544 | /* |
| 570 | * Propagate a new txc->status value into the NTP state: | 545 | * Propagate a new txc->status value into the NTP state: |
| @@ -589,22 +564,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) | |||
| 589 | time_status &= STA_RONLY; | 564 | time_status &= STA_RONLY; |
| 590 | time_status |= txc->status & ~STA_RONLY; | 565 | time_status |= txc->status & ~STA_RONLY; |
| 591 | 566 | ||
| 592 | switch (time_state) { | ||
| 593 | case TIME_OK: | ||
| 594 | ntp_start_leap_timer(ts); | ||
| 595 | break; | ||
| 596 | case TIME_INS: | ||
| 597 | case TIME_DEL: | ||
| 598 | time_state = TIME_OK; | ||
| 599 | ntp_start_leap_timer(ts); | ||
| 600 | case TIME_WAIT: | ||
| 601 | if (!(time_status & (STA_INS | STA_DEL))) | ||
| 602 | time_state = TIME_OK; | ||
| 603 | break; | ||
| 604 | case TIME_OOP: | ||
| 605 | hrtimer_restart(&leap_timer); | ||
| 606 | break; | ||
| 607 | } | ||
| 608 | } | 567 | } |
| 609 | /* | 568 | /* |
| 610 | * Called with the xtime lock held, so we can access and modify | 569 | * Called with the xtime lock held, so we can access and modify |
| @@ -686,9 +645,6 @@ int do_adjtimex(struct timex *txc) | |||
| 686 | (txc->tick < 900000/USER_HZ || | 645 | (txc->tick < 900000/USER_HZ || |
| 687 | txc->tick > 1100000/USER_HZ)) | 646 | txc->tick > 1100000/USER_HZ)) |
| 688 | return -EINVAL; | 647 | return -EINVAL; |
| 689 | |||
| 690 | if (txc->modes & ADJ_STATUS && time_state != TIME_OK) | ||
| 691 | hrtimer_cancel(&leap_timer); | ||
| 692 | } | 648 | } |
| 693 | 649 | ||
| 694 | if (txc->modes & ADJ_SETOFFSET) { | 650 | if (txc->modes & ADJ_SETOFFSET) { |
| @@ -1010,6 +966,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); | |||
| 1010 | void __init ntp_init(void) | 966 | void __init ntp_init(void) |
| 1011 | { | 967 | { |
| 1012 | ntp_clear(); | 968 | ntp_clear(); |
| 1013 | hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); | ||
| 1014 | leap_timer.function = ntp_leap_second; | ||
| 1015 | } | 969 | } |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b53da5ecbea2..5d76e09ddd3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp) | |||
| 184 | } | 184 | } |
| 185 | 185 | ||
| 186 | 186 | ||
| 187 | void timekeeping_leap_insert(int leapsecond) | ||
| 188 | { | ||
| 189 | unsigned long flags; | ||
| 190 | |||
| 191 | write_seqlock_irqsave(&timekeeper.lock, flags); | ||
| 192 | timekeeper.xtime.tv_sec += leapsecond; | ||
| 193 | timekeeper.wall_to_monotonic.tv_sec -= leapsecond; | ||
| 194 | timekeeping_update(false); | ||
| 195 | write_sequnlock_irqrestore(&timekeeper.lock, flags); | ||
| 196 | |||
| 197 | } | ||
| 198 | |||
| 199 | /** | 187 | /** |
| 200 | * timekeeping_forward_now - update clock to the current time | 188 | * timekeeping_forward_now - update clock to the current time |
| 201 | * | 189 | * |
| @@ -969,9 +957,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | |||
| 969 | 957 | ||
| 970 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | 958 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; |
| 971 | while (timekeeper.xtime_nsec >= nsecps) { | 959 | while (timekeeper.xtime_nsec >= nsecps) { |
| 960 | int leap; | ||
| 972 | timekeeper.xtime_nsec -= nsecps; | 961 | timekeeper.xtime_nsec -= nsecps; |
| 973 | timekeeper.xtime.tv_sec++; | 962 | timekeeper.xtime.tv_sec++; |
| 974 | second_overflow(); | 963 | leap = second_overflow(timekeeper.xtime.tv_sec); |
| 964 | timekeeper.xtime.tv_sec += leap; | ||
| 975 | } | 965 | } |
| 976 | 966 | ||
| 977 | /* Accumulate raw time */ | 967 | /* Accumulate raw time */ |
| @@ -1082,9 +1072,11 @@ static void update_wall_time(void) | |||
| 1082 | * xtime.tv_nsec isn't larger then NSEC_PER_SEC | 1072 | * xtime.tv_nsec isn't larger then NSEC_PER_SEC |
| 1083 | */ | 1073 | */ |
| 1084 | if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { | 1074 | if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { |
| 1075 | int leap; | ||
| 1085 | timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; | 1076 | timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; |
| 1086 | timekeeper.xtime.tv_sec++; | 1077 | timekeeper.xtime.tv_sec++; |
| 1087 | second_overflow(); | 1078 | leap = second_overflow(timekeeper.xtime.tv_sec); |
| 1079 | timekeeper.xtime.tv_sec += leap; | ||
| 1088 | } | 1080 | } |
| 1089 | 1081 | ||
| 1090 | timekeeping_update(false); | 1082 | timekeeping_update(false); |
