diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-28 11:43:52 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-28 11:43:52 -0500 |
commit | c28800a9c3caaf387d85ac665a25ebe99e480295 (patch) | |
tree | 401ba805709a65a9f5c1919b3dc763c70b64b38f /kernel | |
parent | ce8f55c2a0ff652480c12a4f1f22ff5ce15e3a22 (diff) | |
parent | 27c9cd7e601632b3794e1c3344d37b86917ffb43 (diff) |
Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
hrtimer: Fix extra wakeups from __remove_hrtimer()
timekeeping: add arch_offset hook to ktime_get functions
clocksource: Avoid selecting mult values that might overflow when adjusted
time: Improve documentation of timekeeeping_adjust()
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/hrtimer.c | 6 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 58 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 92 |
3 files changed, 143 insertions, 13 deletions
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 422e567eecf6..ae34bf51682b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
885 | struct hrtimer_clock_base *base, | 885 | struct hrtimer_clock_base *base, |
886 | unsigned long newstate, int reprogram) | 886 | unsigned long newstate, int reprogram) |
887 | { | 887 | { |
888 | struct timerqueue_node *next_timer; | ||
888 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) | 889 | if (!(timer->state & HRTIMER_STATE_ENQUEUED)) |
889 | goto out; | 890 | goto out; |
890 | 891 | ||
891 | if (&timer->node == timerqueue_getnext(&base->active)) { | 892 | next_timer = timerqueue_getnext(&base->active); |
893 | timerqueue_del(&base->active, &timer->node); | ||
894 | if (&timer->node == next_timer) { | ||
892 | #ifdef CONFIG_HIGH_RES_TIMERS | 895 | #ifdef CONFIG_HIGH_RES_TIMERS |
893 | /* Reprogram the clock event device. if enabled */ | 896 | /* Reprogram the clock event device. if enabled */ |
894 | if (reprogram && hrtimer_hres_active()) { | 897 | if (reprogram && hrtimer_hres_active()) { |
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer, | |||
901 | } | 904 | } |
902 | #endif | 905 | #endif |
903 | } | 906 | } |
904 | timerqueue_del(&base->active, &timer->node); | ||
905 | if (!timerqueue_getnext(&base->active)) | 907 | if (!timerqueue_getnext(&base->active)) |
906 | base->cpu_base->active_bases &= ~(1 << base->index); | 908 | base->cpu_base->active_bases &= ~(1 << base->index); |
907 | out: | 909 | out: |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cf52fda2e096..cfc65e1eb9fb 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void) | |||
492 | } | 492 | } |
493 | 493 | ||
494 | /** | 494 | /** |
495 | * clocksource_max_adjustment- Returns max adjustment amount | ||
496 | * @cs: Pointer to clocksource | ||
497 | * | ||
498 | */ | ||
499 | static u32 clocksource_max_adjustment(struct clocksource *cs) | ||
500 | { | ||
501 | u64 ret; | ||
502 | /* | ||
503 | * We won't try to correct for more then 11% adjustments (110,000 ppm), | ||
504 | */ | ||
505 | ret = (u64)cs->mult * 11; | ||
506 | do_div(ret,100); | ||
507 | return (u32)ret; | ||
508 | } | ||
509 | |||
510 | /** | ||
495 | * clocksource_max_deferment - Returns max time the clocksource can be deferred | 511 | * clocksource_max_deferment - Returns max time the clocksource can be deferred |
496 | * @cs: Pointer to clocksource | 512 | * @cs: Pointer to clocksource |
497 | * | 513 | * |
@@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs) | |||
503 | /* | 519 | /* |
504 | * Calculate the maximum number of cycles that we can pass to the | 520 | * Calculate the maximum number of cycles that we can pass to the |
505 | * cyc2ns function without overflowing a 64-bit signed result. The | 521 | * cyc2ns function without overflowing a 64-bit signed result. The |
506 | * maximum number of cycles is equal to ULLONG_MAX/cs->mult which | 522 | * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) |
507 | * is equivalent to the below. | 523 | * which is equivalent to the below. |
508 | * max_cycles < (2^63)/cs->mult | 524 | * max_cycles < (2^63)/(cs->mult + cs->maxadj) |
509 | * max_cycles < 2^(log2((2^63)/cs->mult)) | 525 | * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) |
510 | * max_cycles < 2^(log2(2^63) - log2(cs->mult)) | 526 | * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) |
511 | * max_cycles < 2^(63 - log2(cs->mult)) | 527 | * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) |
512 | * max_cycles < 1 << (63 - log2(cs->mult)) | 528 | * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) |
513 | * Please note that we add 1 to the result of the log2 to account for | 529 | * Please note that we add 1 to the result of the log2 to account for |
514 | * any rounding errors, ensure the above inequality is satisfied and | 530 | * any rounding errors, ensure the above inequality is satisfied and |
515 | * no overflow will occur. | 531 | * no overflow will occur. |
516 | */ | 532 | */ |
517 | max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); | 533 | max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); |
518 | 534 | ||
519 | /* | 535 | /* |
520 | * The actual maximum number of cycles we can defer the clocksource is | 536 | * The actual maximum number of cycles we can defer the clocksource is |
521 | * determined by the minimum of max_cycles and cs->mask. | 537 | * determined by the minimum of max_cycles and cs->mask. |
538 | * Note: Here we subtract the maxadj to make sure we don't sleep for | ||
539 | * too long if there's a large negative adjustment. | ||
522 | */ | 540 | */ |
523 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); | 541 | max_cycles = min_t(u64, max_cycles, (u64) cs->mask); |
524 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); | 542 | max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, |
543 | cs->shift); | ||
525 | 544 | ||
526 | /* | 545 | /* |
527 | * To ensure that the clocksource does not wrap whilst we are idle, | 546 | * To ensure that the clocksource does not wrap whilst we are idle, |
@@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs) | |||
640 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | 659 | void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) |
641 | { | 660 | { |
642 | u64 sec; | 661 | u64 sec; |
643 | |||
644 | /* | 662 | /* |
645 | * Calc the maximum number of seconds which we can run before | 663 | * Calc the maximum number of seconds which we can run before |
646 | * wrapping around. For clocksources which have a mask > 32bit | 664 | * wrapping around. For clocksources which have a mask > 32bit |
@@ -661,6 +679,20 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) | |||
661 | 679 | ||
662 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, | 680 | clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, |
663 | NSEC_PER_SEC / scale, sec * scale); | 681 | NSEC_PER_SEC / scale, sec * scale); |
682 | |||
683 | /* | ||
684 | * for clocksources that have large mults, to avoid overflow. | ||
685 | * Since mult may be adjusted by ntp, add an safety extra margin | ||
686 | * | ||
687 | */ | ||
688 | cs->maxadj = clocksource_max_adjustment(cs); | ||
689 | while ((cs->mult + cs->maxadj < cs->mult) | ||
690 | || (cs->mult - cs->maxadj > cs->mult)) { | ||
691 | cs->mult >>= 1; | ||
692 | cs->shift--; | ||
693 | cs->maxadj = clocksource_max_adjustment(cs); | ||
694 | } | ||
695 | |||
664 | cs->max_idle_ns = clocksource_max_deferment(cs); | 696 | cs->max_idle_ns = clocksource_max_deferment(cs); |
665 | } | 697 | } |
666 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); | 698 | EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); |
@@ -701,6 +733,12 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); | |||
701 | */ | 733 | */ |
702 | int clocksource_register(struct clocksource *cs) | 734 | int clocksource_register(struct clocksource *cs) |
703 | { | 735 | { |
736 | /* calculate max adjustment for given mult/shift */ | ||
737 | cs->maxadj = clocksource_max_adjustment(cs); | ||
738 | WARN_ONCE(cs->mult + cs->maxadj < cs->mult, | ||
739 | "Clocksource %s might overflow on 11%% adjustment\n", | ||
740 | cs->name); | ||
741 | |||
704 | /* calculate max idle time permitted for this clocksource */ | 742 | /* calculate max idle time permitted for this clocksource */ |
705 | cs->max_idle_ns = clocksource_max_deferment(cs); | 743 | cs->max_idle_ns = clocksource_max_deferment(cs); |
706 | 744 | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..237841378c03 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -249,6 +249,8 @@ ktime_t ktime_get(void) | |||
249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; | 249 | secs = xtime.tv_sec + wall_to_monotonic.tv_sec; |
250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; | 250 | nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; |
251 | nsecs += timekeeping_get_ns(); | 251 | nsecs += timekeeping_get_ns(); |
252 | /* If arch requires, add in gettimeoffset() */ | ||
253 | nsecs += arch_gettimeoffset(); | ||
252 | 254 | ||
253 | } while (read_seqretry(&xtime_lock, seq)); | 255 | } while (read_seqretry(&xtime_lock, seq)); |
254 | /* | 256 | /* |
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts) | |||
280 | *ts = xtime; | 282 | *ts = xtime; |
281 | tomono = wall_to_monotonic; | 283 | tomono = wall_to_monotonic; |
282 | nsecs = timekeeping_get_ns(); | 284 | nsecs = timekeeping_get_ns(); |
285 | /* If arch requires, add in gettimeoffset() */ | ||
286 | nsecs += arch_gettimeoffset(); | ||
283 | 287 | ||
284 | } while (read_seqretry(&xtime_lock, seq)); | 288 | } while (read_seqretry(&xtime_lock, seq)); |
285 | 289 | ||
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset) | |||
802 | s64 error, interval = timekeeper.cycle_interval; | 806 | s64 error, interval = timekeeper.cycle_interval; |
803 | int adj; | 807 | int adj; |
804 | 808 | ||
809 | /* | ||
810 | * The point of this is to check if the error is greater then half | ||
811 | * an interval. | ||
812 | * | ||
813 | * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. | ||
814 | * | ||
815 | * Note we subtract one in the shift, so that error is really error*2. | ||
816 | * This "saves" dividing(shifting) intererval twice, but keeps the | ||
817 | * (error > interval) comparision as still measuring if error is | ||
818 | * larger then half an interval. | ||
819 | * | ||
820 | * Note: It does not "save" on aggrivation when reading the code. | ||
821 | */ | ||
805 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); | 822 | error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); |
806 | if (error > interval) { | 823 | if (error > interval) { |
824 | /* | ||
825 | * We now divide error by 4(via shift), which checks if | ||
826 | * the error is greater then twice the interval. | ||
827 | * If it is greater, we need a bigadjust, if its smaller, | ||
828 | * we can adjust by 1. | ||
829 | */ | ||
807 | error >>= 2; | 830 | error >>= 2; |
831 | /* | ||
832 | * XXX - In update_wall_time, we round up to the next | ||
833 | * nanosecond, and store the amount rounded up into | ||
834 | * the error. This causes the likely below to be unlikely. | ||
835 | * | ||
836 | * The properfix is to avoid rounding up by using | ||
837 | * the high precision timekeeper.xtime_nsec instead of | ||
838 | * xtime.tv_nsec everywhere. Fixing this will take some | ||
839 | * time. | ||
840 | */ | ||
808 | if (likely(error <= interval)) | 841 | if (likely(error <= interval)) |
809 | adj = 1; | 842 | adj = 1; |
810 | else | 843 | else |
811 | adj = timekeeping_bigadjust(error, &interval, &offset); | 844 | adj = timekeeping_bigadjust(error, &interval, &offset); |
812 | } else if (error < -interval) { | 845 | } else if (error < -interval) { |
846 | /* See comment above, this is just switched for the negative */ | ||
813 | error >>= 2; | 847 | error >>= 2; |
814 | if (likely(error >= -interval)) { | 848 | if (likely(error >= -interval)) { |
815 | adj = -1; | 849 | adj = -1; |
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset) | |||
817 | offset = -offset; | 851 | offset = -offset; |
818 | } else | 852 | } else |
819 | adj = timekeeping_bigadjust(error, &interval, &offset); | 853 | adj = timekeeping_bigadjust(error, &interval, &offset); |
820 | } else | 854 | } else /* No adjustment needed */ |
821 | return; | 855 | return; |
822 | 856 | ||
857 | WARN_ONCE(timekeeper.clock->maxadj && | ||
858 | (timekeeper.mult + adj > timekeeper.clock->mult + | ||
859 | timekeeper.clock->maxadj), | ||
860 | "Adjusting %s more then 11%% (%ld vs %ld)\n", | ||
861 | timekeeper.clock->name, (long)timekeeper.mult + adj, | ||
862 | (long)timekeeper.clock->mult + | ||
863 | timekeeper.clock->maxadj); | ||
864 | /* | ||
865 | * So the following can be confusing. | ||
866 | * | ||
867 | * To keep things simple, lets assume adj == 1 for now. | ||
868 | * | ||
869 | * When adj != 1, remember that the interval and offset values | ||
870 | * have been appropriately scaled so the math is the same. | ||
871 | * | ||
872 | * The basic idea here is that we're increasing the multiplier | ||
873 | * by one, this causes the xtime_interval to be incremented by | ||
874 | * one cycle_interval. This is because: | ||
875 | * xtime_interval = cycle_interval * mult | ||
876 | * So if mult is being incremented by one: | ||
877 | * xtime_interval = cycle_interval * (mult + 1) | ||
878 | * Its the same as: | ||
879 | * xtime_interval = (cycle_interval * mult) + cycle_interval | ||
880 | * Which can be shortened to: | ||
881 | * xtime_interval += cycle_interval | ||
882 | * | ||
883 | * So offset stores the non-accumulated cycles. Thus the current | ||
884 | * time (in shifted nanoseconds) is: | ||
885 | * now = (offset * adj) + xtime_nsec | ||
886 | * Now, even though we're adjusting the clock frequency, we have | ||
887 | * to keep time consistent. In other words, we can't jump back | ||
888 | * in time, and we also want to avoid jumping forward in time. | ||
889 | * | ||
890 | * So given the same offset value, we need the time to be the same | ||
891 | * both before and after the freq adjustment. | ||
892 | * now = (offset * adj_1) + xtime_nsec_1 | ||
893 | * now = (offset * adj_2) + xtime_nsec_2 | ||
894 | * So: | ||
895 | * (offset * adj_1) + xtime_nsec_1 = | ||
896 | * (offset * adj_2) + xtime_nsec_2 | ||
897 | * And we know: | ||
898 | * adj_2 = adj_1 + 1 | ||
899 | * So: | ||
900 | * (offset * adj_1) + xtime_nsec_1 = | ||
901 | * (offset * (adj_1+1)) + xtime_nsec_2 | ||
902 | * (offset * adj_1) + xtime_nsec_1 = | ||
903 | * (offset * adj_1) + offset + xtime_nsec_2 | ||
904 | * Canceling the sides: | ||
905 | * xtime_nsec_1 = offset + xtime_nsec_2 | ||
906 | * Which gives us: | ||
907 | * xtime_nsec_2 = xtime_nsec_1 - offset | ||
908 | * Which simplfies to: | ||
909 | * xtime_nsec -= offset | ||
910 | * | ||
911 | * XXX - TODO: Doc ntp_error calculation. | ||
912 | */ | ||
823 | timekeeper.mult += adj; | 913 | timekeeper.mult += adj; |
824 | timekeeper.xtime_interval += interval; | 914 | timekeeper.xtime_interval += interval; |
825 | timekeeper.xtime_nsec -= offset; | 915 | timekeeper.xtime_nsec -= offset; |