author     John Stultz <johnstul@us.ibm.com>    2009-10-02 19:17:53 -0400
committer  Ingo Molnar <mingo@elte.hu>          2009-10-05 07:51:48 -0400
commit     a092ff0f90cae22b2ac8028ecd2c6f6c1a9e4601
tree       79ec451d0bcdf6c08e0bc210b4beed694fbbf4a9
parent     8a0382f6fceaf0c6479e582e1054f36333ea3d24
time: Implement logarithmic time accumulation
Accumulating one tick at a time works well unless we're using NOHZ.
Then it can be an issue, since we may have to run through the loop
a few thousand times, which can noticeably increase the latency
caused by the timer interrupt.
The current solution was to accumulate in half-second intervals
with NOHZ. This kept the number of loops down; however, it slightly
changed how we make NTP adjustments. While this is not an issue for
NTPd users, since NTPd makes adjustments over a longer period of
time, other adjtimex() users have noticed the half-second
granularity with which frequency changes can be applied to the clock.
For instance, if an application tries to apply a 100ppm frequency
correction for 20ms to correct a 2us offset, with NOHZ it either
gets no correction or a 50us correction.
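(Spelling out that example, which is my arithmetic rather than part of
the original changelog: 100ppm applied for 20ms is 100e-6 * 0.020s = 2us,
the intended correction, while one half-second accumulation chunk at
100ppm is 100e-6 * 0.5s = 50us, so under NOHZ the applied correction is
quantized to either 0 or 50us.)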
Now, there will always be some granularity error when applying
frequency corrections. However, users sensitive to this error have
seen a 50-500x increase in it with NOHZ compared to running without
NOHZ.
So I figured I'd try an approach other than simply increasing the
interval: consume the time interval logarithmically. This reduces
the number of trips through the loop, keeping latency down, while
still preserving the original granularity error for adjtimex()
changes.
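For illustration, here is a minimal standalone sketch of that logarithmic
consumption. The names (accumulate_chunk, interval) and numbers are
illustrative, not from the patch, and the real code additionally
accumulates xtime, raw time and NTP error for each chunk, as the diff
below shows:

#include <stdint.h>
#include <stdio.h>

/* Consume one 2^shift-sized chunk of intervals, if the remaining offset
 * is large enough, and return the unconsumed remainder. */
static uint64_t accumulate_chunk(uint64_t offset, uint64_t interval,
                                 int shift, uint64_t *accumulated)
{
        uint64_t chunk = interval << shift;

        if (offset < chunk)
                return offset;
        *accumulated += chunk;
        return offset - chunk;
}

int main(void)
{
        uint64_t interval = 1000;               /* cycles per "tick" */
        uint64_t offset = 5000 * interval;      /* e.g. 5000 ticks slept through */
        uint64_t accumulated = 0;
        int loops = 0;
        /* Largest power-of-two multiple of interval that fits in offset
         * (assumes offset >= interval; __builtin_clzll is a GCC/Clang builtin). */
        int shift = 63 - __builtin_clzll(offset / interval);

        while (offset >= interval) {
                offset = accumulate_chunk(offset, interval, shift, &accumulated);
                shift--;
                loops++;
        }
        printf("consumed %llu cycles in %d iterations\n",
               (unsigned long long)accumulated, loops);
        return 0;
}

A one-tick-at-a-time loop would have run 5000 times here; the shifted
version consumes the same 5,000,000 cycles in 10 iterations, while each
chunk still feeds the same per-interval error bookkeeping.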
Further, this change allows us to remove the xtime_cache code
(patch to follow), as xtime is always within one tick of the
current time, instead of the half-second updates it saw before.
An earlier version of this patch has been shipping to x86 users in
the Red Hat MRG releases for a while without issue, but I've reworked
this version to be even more careful about avoiding possible
overflows if the shift value gets too large.
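The overflow precaution mentioned here shows up below as the maxshift
clamp on the accumulation shift. A minimal sketch of that bound, assuming
a 64-bit tick_length and the GCC/Clang __builtin_clzll builtin in place of
the kernel's ilog2():

#include <stdint.h>

/* One less than the largest shift for which tick_length << shift still
 * fits in 64 bits, mirroring the patch's maxshift computation.
 * Assumes tick_length != 0. */
static int max_safe_shift(uint64_t tick_length)
{
        int bits_used = 64 - __builtin_clzll(tick_length);   /* ilog2(x) + 1 */

        return (64 - bits_used) - 1;
}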
Signed-off-by: John Stultz <johnstul@us.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: John Kacur <jkacur@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1254525473.7741.88.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--  include/linux/timex.h     |  4
-rw-r--r--  kernel/time/timekeeping.c | 85
2 files changed, 60 insertions, 29 deletions
diff --git a/include/linux/timex.h b/include/linux/timex.h
index e6967d10d9e5..0c0ef7d4db7c 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -261,11 +261,7 @@ static inline int ntp_synced(void)
 
 #define NTP_SCALE_SHIFT        32
 
-#ifdef CONFIG_NO_HZ
-#define NTP_INTERVAL_FREQ  (2)
-#else
 #define NTP_INTERVAL_FREQ  (HZ)
-#endif
 #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ)
 
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46fa1ecd..5fdd78e0858a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -721,6 +721,51 @@ static void timekeeping_adjust(s64 offset)
                                 timekeeper.ntp_error_shift;
 }
 
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles into
+ * a shifted interval of nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */
+static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
+{
+        u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+
+        /* If the offset is smaller than a shifted interval, do nothing */
+        if (offset < timekeeper.cycle_interval << shift)
+                return offset;
+
+        /* Accumulate one shifted interval */
+        offset -= timekeeper.cycle_interval << shift;
+        timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
+
+        timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
+        while (timekeeper.xtime_nsec >= nsecps) {
+                timekeeper.xtime_nsec -= nsecps;
+                xtime.tv_sec++;
+                second_overflow();
+        }
+
+        /* Accumulate into raw time */
+        raw_time.tv_nsec += timekeeper.raw_interval << shift;
+        while (raw_time.tv_nsec >= NSEC_PER_SEC) {
+                raw_time.tv_nsec -= NSEC_PER_SEC;
+                raw_time.tv_sec++;
+        }
+
+        /* Accumulate error between NTP and clock interval */
+        timekeeper.ntp_error += tick_length << shift;
+        timekeeper.ntp_error -= timekeeper.xtime_interval <<
+                                (timekeeper.ntp_error_shift + shift);
+
+        return offset;
+}
+
+
 /**
  * update_wall_time - Uses the current clocksource to increment the wall time
  *
@@ -731,6 +776,7 @@ void update_wall_time(void)
         struct clocksource *clock;
         cycle_t offset;
         u64 nsecs;
+        int shift = 0, maxshift;
 
         /* Make sure we're fully resumed: */
         if (unlikely(timekeeping_suspended))
@@ -744,33 +790,22 @@ void update_wall_time(void)
 #endif
         timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
 
-        /* normally this loop will run just once, however in the
-         * case of lost or late ticks, it will accumulate correctly.
+        /*
+         * With NO_HZ we may have to accumulate many cycle_intervals
+         * (think "ticks") worth of time at once. To do this efficiently,
+         * we calculate the largest doubling multiple of cycle_intervals
+         * that is smaller than the offset. We then accumulate that
+         * chunk in one go, and then try to consume the next smaller
+         * doubled multiple.
          */
+        shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+        shift = max(0, shift);
+        /* Bound shift to one less than what overflows tick_length */
+        maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
+        shift = min(shift, maxshift);
         while (offset >= timekeeper.cycle_interval) {
-                u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
-
-                /* accumulate one interval */
-                offset -= timekeeper.cycle_interval;
-                clock->cycle_last += timekeeper.cycle_interval;
-
-                timekeeper.xtime_nsec += timekeeper.xtime_interval;
-                if (timekeeper.xtime_nsec >= nsecps) {
-                        timekeeper.xtime_nsec -= nsecps;
-                        xtime.tv_sec++;
-                        second_overflow();
-                }
-
-                raw_time.tv_nsec += timekeeper.raw_interval;
-                if (raw_time.tv_nsec >= NSEC_PER_SEC) {
-                        raw_time.tv_nsec -= NSEC_PER_SEC;
-                        raw_time.tv_sec++;
-                }
-
-                /* accumulate error between NTP and clock interval */
-                timekeeper.ntp_error += tick_length;
-                timekeeper.ntp_error -= timekeeper.xtime_interval <<
-                                        timekeeper.ntp_error_shift;
+                offset = logarithmic_accumulation(offset, shift);
+                shift--;
         }
 
         /* correct the clock when NTP error is too big */