diff options
author | john stultz <johnstul@us.ibm.com> | 2009-10-02 19:17:53 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-10-05 07:51:48 -0400 |
commit | a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601 (patch) | |
tree | 79ec451d0bcdf6c08e0bc210b4beed694fbbf4a9 /kernel | |
parent | 8a0382f6fceaf0c6479e582e1054f36333ea3d24 (diff) |
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can increase timer interrupt caused
latency.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down, however it did
slightly change how we make NTP adjustments. While not an issue
with NTPd users, as NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which we can apply frequency changes to the clock.
For instance, if a application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ they either
get no correction, or a 50us correction.
Now, there will always be some granularity error for applying
frequency corrections. However with users sensitive to this error
have seen a 50-500x increase with NOHZ compared to running without
NOHZ.
So I figured I'd try another approach then just simply increasing
the interval. My approach is to consume the time interval
logarithmically. This reduces the number of times through the loop
needed keeping latency down, while still preserving the original
granularity error for adjtimex() changes.
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the RedHat MRG releases for awhile without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/time/timekeeping.c | 85 |
1 files changed, 60 insertions, 25 deletions
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ec..5fdd78e0858 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset) | |||
721 | timekeeper.ntp_error_shift; | 721 | timekeeper.ntp_error_shift; |
722 | } | 722 | } |
723 | 723 | ||
724 | |||
725 | /** | ||
726 | * logarithmic_accumulation - shifted accumulation of cycles | ||
727 | * | ||
728 | * This functions accumulates a shifted interval of cycles into | ||
729 | * into a shifted interval nanoseconds. Allows for O(log) accumulation | ||
730 | * loop. | ||
731 | * | ||
732 | * Returns the unconsumed cycles. | ||
733 | */ | ||
734 | static cycle_t logarithmic_accumulation(cycle_t offset, int shift) | ||
735 | { | ||
736 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | ||
737 | |||
738 | /* If the offset is smaller then a shifted interval, do nothing */ | ||
739 | if (offset < timekeeper.cycle_interval<<shift) | ||
740 | return offset; | ||
741 | |||
742 | /* Accumulate one shifted interval */ | ||
743 | offset -= timekeeper.cycle_interval << shift; | ||
744 | timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift; | ||
745 | |||
746 | timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; | ||
747 | while (timekeeper.xtime_nsec >= nsecps) { | ||
748 | timekeeper.xtime_nsec -= nsecps; | ||
749 | xtime.tv_sec++; | ||
750 | second_overflow(); | ||
751 | } | ||
752 | |||
753 | /* Accumulate into raw time */ | ||
754 | raw_time.tv_nsec += timekeeper.raw_interval << shift;; | ||
755 | while (raw_time.tv_nsec >= NSEC_PER_SEC) { | ||
756 | raw_time.tv_nsec -= NSEC_PER_SEC; | ||
757 | raw_time.tv_sec++; | ||
758 | } | ||
759 | |||
760 | /* Accumulate error between NTP and clock interval */ | ||
761 | timekeeper.ntp_error += tick_length << shift; | ||
762 | timekeeper.ntp_error -= timekeeper.xtime_interval << | ||
763 | (timekeeper.ntp_error_shift + shift); | ||
764 | |||
765 | return offset; | ||
766 | } | ||
767 | |||
768 | |||
724 | /** | 769 | /** |
725 | * update_wall_time - Uses the current clocksource to increment the wall time | 770 | * update_wall_time - Uses the current clocksource to increment the wall time |
726 | * | 771 | * |
@@ -731,6 +776,7 @@ void update_wall_time(void) | |||
731 | struct clocksource *clock; | 776 | struct clocksource *clock; |
732 | cycle_t offset; | 777 | cycle_t offset; |
733 | u64 nsecs; | 778 | u64 nsecs; |
779 | int shift = 0, maxshift; | ||
734 | 780 | ||
735 | /* Make sure we're fully resumed: */ | 781 | /* Make sure we're fully resumed: */ |
736 | if (unlikely(timekeeping_suspended)) | 782 | if (unlikely(timekeeping_suspended)) |
@@ -744,33 +790,22 @@ void update_wall_time(void) | |||
744 | #endif | 790 | #endif |
745 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; | 791 | timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; |
746 | 792 | ||
747 | /* normally this loop will run just once, however in the | 793 | /* |
748 | * case of lost or late ticks, it will accumulate correctly. | 794 | * With NO_HZ we may have to accumulate many cycle_intervals |
795 | * (think "ticks") worth of time at once. To do this efficiently, | ||
796 | * we calculate the largest doubling multiple of cycle_intervals | ||
797 | * that is smaller then the offset. We then accumulate that | ||
798 | * chunk in one go, and then try to consume the next smaller | ||
799 | * doubled multiple. | ||
749 | */ | 800 | */ |
801 | shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); | ||
802 | shift = max(0, shift); | ||
803 | /* Bound shift to one less then what overflows tick_length */ | ||
804 | maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; | ||
805 | shift = min(shift, maxshift); | ||
750 | while (offset >= timekeeper.cycle_interval) { | 806 | while (offset >= timekeeper.cycle_interval) { |
751 | u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; | 807 | offset = logarithmic_accumulation(offset, shift); |
752 | 808 | shift--; | |
753 | /* accumulate one interval */ | ||
754 | offset -= timekeeper.cycle_interval; | ||
755 | clock->cycle_last += timekeeper.cycle_interval; | ||
756 | |||
757 | timekeeper.xtime_nsec += timekeeper.xtime_interval; | ||
758 | if (timekeeper.xtime_nsec >= nsecps) { | ||
759 | timekeeper.xtime_nsec -= nsecps; | ||
760 | xtime.tv_sec++; | ||
761 | second_overflow(); | ||
762 | } | ||
763 | |||
764 | raw_time.tv_nsec += timekeeper.raw_interval; | ||
765 | if (raw_time.tv_nsec >= NSEC_PER_SEC) { | ||
766 | raw_time.tv_nsec -= NSEC_PER_SEC; | ||
767 | raw_time.tv_sec++; | ||
768 | } | ||
769 | |||
770 | /* accumulate error between NTP and clock interval */ | ||
771 | timekeeper.ntp_error += tick_length; | ||
772 | timekeeper.ntp_error -= timekeeper.xtime_interval << | ||
773 | timekeeper.ntp_error_shift; | ||
774 | } | 809 | } |
775 | 810 | ||
776 | /* correct the clock when NTP error is too big */ | 811 | /* correct the clock when NTP error is too big */ |