author     Thomas Gleixner <tglx@linutronix.de>    2008-04-01 13:45:18 -0400
committer  Ingo Molnar <mingo@elte.hu>             2008-04-19 13:19:55 -0400
commit     d8bb6f4c1670c8324e4135c61ef07486f7f17379 (patch)
tree       d53b676621b3bd6bb4d39a4b22588b58e1a7ea45
parent     f1326973262382150c26bf4dfccd0fce310c4a9c (diff)
x86: tsc prevent time going backwards
We already catch most of the TSC problems by sanity checks, but there is a subtle bug which has been in the code forever. It can cause time jumps in the range of hours.

This was reported in:
http://lkml.org/lkml/2007/8/23/96 and
http://lkml.org/lkml/2008/3/31/23

I was able to reproduce the problem with a gettimeofday loop test on a dual core and a quad core machine, both of which have synchronized TSCs. The TSCs seem not to be perfectly in sync, though: the kernel is not able to detect the slight delta in the sync check, yet there exists an extremely small window in which this delta can be observed as a really big time jump. So far I was only able to reproduce this with the vsyscall gettimeofday implementation, but in theory it might be observable with the syscall-based version as well.

CPU 0 updates the clock source variables under the xtime/vsyscall lock, and CPU 1, where the TSC is slightly behind CPU 0, reads the time right after the seqlock was unlocked. The clocksource reference data was updated with the TSC from CPU 0, and the value read from the TSC on CPU 1 is less than the reference data. This results in a huge delta value due to the unsigned subtraction of the TSC value and the reference value. This algorithm cannot be changed, as it is required to support wrapping clock sources like the pm timer. The huge delta is converted to nanoseconds and added to xtime, which is then observable by the caller. The next gettimeofday call on CPU 1 shows the correct time again, as by then the TSC has advanced above the reference value.

To prevent this TSC-specific wreckage we need to compare the TSC value against the reference value and return the latter when it is larger than the actual TSC value. I considered marking the TSC unstable when the readout is smaller than the reference value, but that would render an otherwise good and fast clocksource unusable without a really good reason.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
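To make the failure mode concrete, here is a minimal userspace sketch (not part of the patch; the variable names and values are made up for illustration) of how a readout just a couple of cycles behind the reference value wraps into a near-2^64 delta:

#include <stdio.h>

typedef unsigned long long cycle_t;

int main(void)
{
        cycle_t cycle_last = 1000;      /* reference, taken from CPU 0's TSC */
        cycle_t tsc_cpu1   = 998;       /* CPU 1's TSC, 2 cycles behind */

        /* The timekeeping core uses an unsigned subtraction so that
         * wrapping clocksources still work after an overflow: */
        cycle_t delta = tsc_cpu1 - cycle_last;

        /* Prints 18446744073709551614: the 2-cycle deficit becomes a
         * near-2^64 cycle "elapsed time", which is then converted to
         * nanoseconds and added to xtime. */
        printf("%llu\n", delta);
        return 0;
}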
-rw-r--r--  arch/x86/kernel/tsc_32.c   | 15
-rw-r--r--  arch/x86/kernel/tsc_64.c   | 23
-rw-r--r--  kernel/time/timekeeping.c  |  2
3 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index 06af8cf8251f..e4790728b224 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -284,14 +284,27 @@ core_initcall(cpufreq_tsc);
 /* clock source code */
 
 static unsigned long current_tsc_khz;
+static struct clocksource clocksource_tsc;
 
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp issue. This can be observed in
+ * a very small window right after one CPU updated cycle_last under
+ * xtime lock and the other CPU reads a TSC value which is smaller
+ * than the cycle_last reference value due to a TSC which is slightly
+ * behind. This delta is nowhere else observable, but in that case it
+ * results in a forward time jump in the range of hours due to the
+ * unsigned delta calculation of the timekeeping core code, which is
+ * necessary to support wrapping clocksources like pm timer.
+ */
 static cycle_t read_tsc(void)
 {
         cycle_t ret;
 
         rdtscll(ret);
 
-        return ret;
+        return ret >= clocksource_tsc.cycle_last ?
+                ret : clocksource_tsc.cycle_last;
 }
 
 static struct clocksource clocksource_tsc = {
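For context on why the clamp is sufficient, the generic timekeeping code converts the readout into a nanosecond offset roughly as follows. This is a simplified model of the 2.6.25-era __get_nsec_offset()/cyc2ns() path, written from context rather than quoted from the kernel; the struct below merely stands in for the relevant clocksource fields:

typedef unsigned long long cycle_t;
typedef long long s64;

/* Stand-in for the struct clocksource fields used in the conversion. */
struct cs_model {
        cycle_t (*read)(void);  /* e.g. the clamped read_tsc() above */
        cycle_t cycle_last;     /* reference value, updated under xtime lock */
        cycle_t mask;           /* full 64 bits wide for the TSC */
        unsigned int mult;
        unsigned int shift;
};

static s64 nsec_offset(struct cs_model *cs)
{
        /* Unsigned subtraction plus masking: required so that narrow,
         * wrapping clocksources (e.g. the 24-bit ACPI pm timer) still
         * yield the correct elapsed count after an overflow. */
        cycle_t delta = (cs->read() - cs->cycle_last) & cs->mask;

        /* With the clamp in read_tsc(), a slightly-behind TSC now
         * produces delta == 0 here instead of a value close to 2^64. */
        return ((s64)delta * cs->mult) >> cs->shift;
}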
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index ceeba01e7f47..fcc16e58609e 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -11,6 +11,7 @@
 #include <asm/hpet.h>
 #include <asm/timex.h>
 #include <asm/timer.h>
+#include <asm/vgtod.h>
 
 static int notsc __initdata = 0;
 
@@ -287,18 +288,34 @@ int __init notsc_setup(char *s)
 
 __setup("notsc", notsc_setup);
 
+static struct clocksource clocksource_tsc;
 
-/* clock source code: */
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slightly behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the timekeeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ */
 static cycle_t read_tsc(void)
 {
         cycle_t ret = (cycle_t)get_cycles();
-        return ret;
+
+        return ret >= clocksource_tsc.cycle_last ?
+                ret : clocksource_tsc.cycle_last;
 }
 
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
         cycle_t ret = (cycle_t)vget_cycles();
-        return ret;
+
+        return ret >= __vsyscall_gtod_data.clock.cycle_last ?
+                ret : __vsyscall_gtod_data.clock.cycle_last;
 }
 
 static struct clocksource clocksource_tsc = {
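Note that vread_tsc() runs in userspace via the vsyscall page and therefore clamps against the copy exported in __vsyscall_gtod_data (hence the new <asm/vgtod.h> include) rather than the kernel's clocksource struct. The changelog mentions a gettimeofday loop test as the reproducer; an illustrative reconstruction (not the original test) looks like this. Since the forward jump of hours is followed by a step back to the correct time on the next call, checking for backward motion between consecutive samples is enough:

#include <stdio.h>
#include <sys/time.h>

int main(void)
{
        struct timeval last, now;

        gettimeofday(&last, NULL);
        for (;;) {
                gettimeofday(&now, NULL);
                /* Time must never move backwards between samples. */
                if (now.tv_sec < last.tv_sec ||
                    (now.tv_sec == last.tv_sec && now.tv_usec < last.tv_usec)) {
                        fprintf(stderr, "backwards: %ld.%06ld -> %ld.%06ld\n",
                                (long)last.tv_sec, (long)last.tv_usec,
                                (long)now.tv_sec, (long)now.tv_usec);
                        return 1;
                }
                last = now;
        }
}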
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index a3fa587c350c..2d6087c7cf98 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -178,6 +178,7 @@ static void change_clocksource(void)
 	if (clock == new)
 		return;
 
+	new->cycle_last = 0;
 	now = clocksource_read(new);
 	nsec = __get_nsec_offset();
 	timespec_add_ns(&xtime, nsec);
@@ -295,6 +296,7 @@ static int timekeeping_resume(struct sys_device *dev)
 	timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
 	update_xtime_cache(0);
 	/* re-base the last cycle value */
+	clock->cycle_last = 0;
 	clock->cycle_last = clocksource_read(clock);
 	clock->error = 0;
 	timekeeping_suspended = 0;
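The timekeeping.c hunks are needed precisely because read_tsc() now clamps against cycle_last: at a clocksource switch or across suspend/resume, a stale cycle_last could exceed the first fresh readout, and the clamp would hand back the stale value. Zeroing cycle_last first makes the clamp a no-op for that initial re-basing read. A minimal sketch of the hazard, with made-up names and values:

#include <stdio.h>

typedef unsigned long long cycle_t;

static cycle_t cycle_last;

/* Model of the clamped readout introduced by the patch. */
static cycle_t read_clamped(cycle_t hw)
{
        return hw >= cycle_last ? hw : cycle_last;
}

int main(void)
{
        cycle_t hw = 42;                /* counter restarted after resume */

        cycle_last = 123456789ULL;      /* stale pre-suspend reference */
        printf("%llu\n", read_clamped(hw));     /* 123456789: stale value wins */

        cycle_last = 0;                 /* what the patch does first */
        printf("%llu\n", read_clamped(hw));     /* 42: fresh readout re-bases */
        return 0;
}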