author    John Stultz <john.stultz@linaro.org>  2012-07-17 13:33:48 -0400
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>  2012-07-19 11:58:22 -0400
commit    9c24771f844b6f0708a72cd116953e0a128e5d2a (patch)
tree      bdbee9c35ca47d6b023a9d86f0e15cb508fe7508 /kernel
parent    31b83ef7cfda5a7b74446ca70c1e231b24450cbd (diff)
ntp: Fix leap-second hrtimer livelock
This is a backport of 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d.

This should have been backported when it was committed, but I mistook
the problem as requiring the ntp_lock changes that landed in 3.4 in
order for it to occur.

Unfortunately the same issue can happen (with only one cpu) as follows:

  do_adjtimex()
   write_seqlock_irq(&xtime_lock);
    process_adjtimex_modes()
     process_adj_status()
      ntp_start_leap_timer()
       hrtimer_start()
        hrtimer_reprogram()
         tick_program_event()
          clockevents_program_event()
           ktime_get()
            seq = read_seqbegin(xtime_lock);   [DEADLOCK]

This deadlock will not always occur, as it requires the leap_timer to
force a hrtimer_reprogram, which only happens if it is set and there is
no sooner timer to expire.

NOTE: This patch, being faithful to the original commit, introduces a
bug (we don't update wall_to_monotonic), which will be resolved by
backporting a following fix.

Original commit message below:

Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp subsystem
has used an hrtimer for triggering the leapsecond adjustment. However,
this can cause a potential livelock.

Thomas diagnosed this as the following pattern:

  CPU 0                                    CPU 1
  do_adjtimex()
   spin_lock_irq(&ntp_lock);
    process_adjtimex_modes();              timer_interrupt()
     process_adj_status();                  do_timer()
      ntp_start_leap_timer();                write_lock(&xtime_lock);
       hrtimer_start();                      update_wall_time();
        hrtimer_reprogram();                  ntp_tick_length()
         tick_program_event()                  spin_lock(&ntp_lock);
          clockevents_program_event()
           ktime_get()
            seq = read_seqbegin(xtime_lock);

This patch tries to avoid the problem by reverting back to not using an
hrtimer to inject leapseconds, and instead we handle the leapsecond
processing in the second_overflow() function.

The downside to this change is that on systems that support highres
timers, the leap second processing will occur on a HZ tick boundary
(i.e. ~1-10ms, depending on HZ) after the leap second instead of
possibly sooner (~34us in my tests w/ x86_64 lapic).

This patch applies on top of tip/timers/core.

CC: Sasha Levin <levinsasha928@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Sasha Levin <levinsasha928@gmail.com>
Diagnosed-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Sasha Levin <levinsasha928@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
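To make the new leap handling concrete: after this change, second_overflow() checks at each seconds rollover whether the (already incremented) seconds value sits on a UTC day boundary and returns the leap offset for the caller to apply. The following is a minimal standalone C sketch of just that boundary check; leap_offset and the pending flags are illustrative names for this sketch, not symbols from the patch, and the NTP state machine (TIME_INS/TIME_OOP/...) is deliberately omitted.

/*
 * Standalone sketch (not kernel code) of the day-boundary checks the
 * patched second_overflow() relies on: with an insert pending, the
 * extra second is applied when the new wall-clock second is an exact
 * multiple of 86400 (i.e. 00:00:00 UTC); a deletion fires one second
 * earlier.  This stub is stateless, unlike the real state machine.
 */
#include <stdio.h>

/* Returns the leap offset (in seconds) to add to the clock, if any. */
static int leap_offset(unsigned long secs, int insert_pending, int delete_pending)
{
        if (insert_pending && secs % 86400 == 0)
                return -1;      /* repeat the last second, shown as 23:59:60 */
        if (delete_pending && (secs + 1) % 86400 == 0)
                return 1;       /* skip 23:59:59 entirely */
        return 0;
}

int main(void)
{
        /* 2012-06-30 23:59:59 UTC is 1341100799; midnight is 1341100800. */
        unsigned long secs;

        for (secs = 1341100798UL; secs <= 1341100801UL; secs++)
                printf("%lu -> leap %d\n", secs, leap_offset(secs, 1, 0));
        return 0;
}

Compiled as ordinary userspace C, the loop around the June 2012 leap second prints a -1 offset only at 1341100800 (00:00:00 UTC on 2012-07-01), which is the point at which the patched kernel code folds the extra second back into xtime.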
Diffstat (limited to 'kernel')
 kernel/time/ntp.c         | 122
 kernel/time/timekeeping.c |  18
 2 files changed, 47 insertions(+), 93 deletions(-)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4b85a7a7252..4508f7f68a7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,8 +31,6 @@ unsigned long tick_nsec;
 u64                     tick_length;
 static u64              tick_length_base;
 
-static struct hrtimer   leap_timer;
-
 #define MAX_TICKADJ     500LL           /* usecs */
 #define MAX_TICKADJ_SCALED \
         (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -350,60 +348,60 @@ void ntp_clear(void)
 }
 
 /*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second.
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
  */
-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+int second_overflow(unsigned long secs)
 {
-        enum hrtimer_restart res = HRTIMER_NORESTART;
-
-        write_seqlock(&xtime_lock);
+        int leap = 0;
+        s64 delta;
 
+        /*
+         * Leap second processing. If in leap-insert state at the end of the
+         * day, the system clock is set back one second; if in leap-delete
+         * state, the system clock is set ahead one second.
+         */
         switch (time_state) {
         case TIME_OK:
+                if (time_status & STA_INS)
+                        time_state = TIME_INS;
+                else if (time_status & STA_DEL)
+                        time_state = TIME_DEL;
                 break;
         case TIME_INS:
-                timekeeping_leap_insert(-1);
-                time_state = TIME_OOP;
-                printk(KERN_NOTICE
-                        "Clock: inserting leap second 23:59:60 UTC\n");
-                hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
-                res = HRTIMER_RESTART;
+                if (secs % 86400 == 0) {
+                        leap = -1;
+                        time_state = TIME_OOP;
+                        printk(KERN_NOTICE
+                                "Clock: inserting leap second 23:59:60 UTC\n");
+                }
                 break;
         case TIME_DEL:
-                timekeeping_leap_insert(1);
-                time_tai--;
-                time_state = TIME_WAIT;
-                printk(KERN_NOTICE
-                        "Clock: deleting leap second 23:59:59 UTC\n");
+                if ((secs + 1) % 86400 == 0) {
+                        leap = 1;
+                        time_tai--;
+                        time_state = TIME_WAIT;
+                        printk(KERN_NOTICE
+                                "Clock: deleting leap second 23:59:59 UTC\n");
+                }
                 break;
         case TIME_OOP:
                 time_tai++;
                 time_state = TIME_WAIT;
-                /* fall through */
+                break;
+
         case TIME_WAIT:
                 if (!(time_status & (STA_INS | STA_DEL)))
                         time_state = TIME_OK;
                 break;
         }
 
-        write_sequnlock(&xtime_lock);
-
-        return res;
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- */
-void second_overflow(void)
-{
-        s64 delta;
-
 
         /* Bump the maxerror field */
         time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,23 +421,25 @@ void second_overflow(void)
         pps_dec_valid();
 
         if (!time_adjust)
-                return;
+                goto out;
 
         if (time_adjust > MAX_TICKADJ) {
                 time_adjust -= MAX_TICKADJ;
                 tick_length += MAX_TICKADJ_SCALED;
-                return;
+                goto out;
         }
 
         if (time_adjust < -MAX_TICKADJ) {
                 time_adjust += MAX_TICKADJ;
                 tick_length -= MAX_TICKADJ_SCALED;
-                return;
+                goto out;
         }
 
         tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
                                                          << NTP_SCALE_SHIFT;
         time_adjust = 0;
+out:
+        return leap;
 }
 
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -501,27 +501,6 @@ static void notify_cmos_timer(void)
 static inline void notify_cmos_timer(void) { }
 #endif
 
-/*
- * Start the leap seconds timer:
- */
-static inline void ntp_start_leap_timer(struct timespec *ts)
-{
-        long now = ts->tv_sec;
-
-        if (time_status & STA_INS) {
-                time_state = TIME_INS;
-                now += 86400 - now % 86400;
-                hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-
-                return;
-        }
-
-        if (time_status & STA_DEL) {
-                time_state = TIME_DEL;
-                now += 86400 - (now + 1) % 86400;
-                hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-        }
-}
 
 /*
  * Propagate a new txc->status value into the NTP state:
@@ -546,22 +525,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
         time_status &= STA_RONLY;
         time_status |= txc->status & ~STA_RONLY;
 
-        switch (time_state) {
-        case TIME_OK:
-                ntp_start_leap_timer(ts);
-                break;
-        case TIME_INS:
-        case TIME_DEL:
-                time_state = TIME_OK;
-                ntp_start_leap_timer(ts);
-        case TIME_WAIT:
-                if (!(time_status & (STA_INS | STA_DEL)))
-                        time_state = TIME_OK;
-                break;
-        case TIME_OOP:
-                hrtimer_restart(&leap_timer);
-                break;
-        }
 }
 /*
  * Called with the xtime lock held, so we can access and modify
@@ -643,9 +606,6 @@ int do_adjtimex(struct timex *txc)
                     (txc->tick <  900000/USER_HZ ||
                      txc->tick > 1100000/USER_HZ))
                         return -EINVAL;
-
-                if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
-                        hrtimer_cancel(&leap_timer);
         }
 
         if (txc->modes & ADJ_SETOFFSET) {
@@ -967,6 +927,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
 void __init ntp_init(void)
 {
         ntp_clear();
-        hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
-        leap_timer.function = ntp_leap_second;
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5f458310668..c444da085a2 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -169,15 +169,6 @@ static struct timespec raw_time;
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
-/* must hold xtime_lock */
-void timekeeping_leap_insert(int leapsecond)
-{
-        xtime.tv_sec += leapsecond;
-        wall_to_monotonic.tv_sec -= leapsecond;
-        update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
-                        timekeeper.mult);
-}
-
 /**
  * timekeeping_forward_now - update clock to the current time
  *
@@ -828,9 +819,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
 
         timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
         while (timekeeper.xtime_nsec >= nsecps) {
+                int leap;
                 timekeeper.xtime_nsec -= nsecps;
                 xtime.tv_sec++;
-                second_overflow();
+                leap = second_overflow(xtime.tv_sec);
+                xtime.tv_sec += leap;
         }
 
         /* Accumulate raw time */
@@ -936,9 +929,12 @@ static void update_wall_time(void)
          * xtime.tv_nsec isn't larger then NSEC_PER_SEC
          */
         if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
+                int leap;
                 xtime.tv_nsec -= NSEC_PER_SEC;
                 xtime.tv_sec++;
-                second_overflow();
+                leap = second_overflow(xtime.tv_sec);
+                xtime.tv_sec += leap;
+
         }
 
         /* check to see if there is a new clocksource to use */
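
For reference, the caller-side shape of the timekeeping.c change can be sketched outside the kernel as follows; second_overflow_stub, apply_tick and the plain unsigned long counters are illustrative stand-ins, not kernel symbols, and the stub ignores the NTP state machine that the real second_overflow() uses to fire the leap only once.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000UL

/* Illustrative stand-in for the kernel's second_overflow(): returns the
 * leap offset to fold into the seconds counter (0 outside a leap edge). */
static int second_overflow_stub(unsigned long secs)
{
        return (secs % 86400 == 0) ? -1 : 0;    /* pretend an insert is pending */
}

/* Mirror of the update_wall_time() pattern above: roll nanoseconds over
 * into seconds, then apply whatever leap offset the overflow handler
 * reports instead of relying on a separate hrtimer callback. */
static void apply_tick(unsigned long *sec, unsigned long *nsec, unsigned long delta_ns)
{
        *nsec += delta_ns;
        while (*nsec >= NSEC_PER_SEC) {
                int leap;

                *nsec -= NSEC_PER_SEC;
                (*sec)++;
                leap = second_overflow_stub(*sec);
                *sec += leap;
        }
}

int main(void)
{
        unsigned long sec = 1341100799UL;       /* 2012-06-30 23:59:59 UTC */
        unsigned long nsec = 999000000UL;

        apply_tick(&sec, &nsec, 2000000UL);     /* cross the second boundary */
        printf("%lu.%09lu\n", sec, nsec);       /* stays at ...799: leap inserted */
        return 0;
}

Running this across the 2012-06-30 23:59:59 boundary leaves the seconds counter at 1341100799, i.e. the inserted leap second, mirroring what update_wall_time() now does with the value returned by second_overflow().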