diff options
author | Paul Mackerras <paulus@samba.org> | 2010-06-20 15:03:08 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2010-07-08 21:26:16 -0400 |
commit | 8fd63a9ea7528463211a6c88d500c51851d960c8 (patch) | |
tree | a24f11824e6c31ebd632ff5bcfb27a6e45713f7c /arch/powerpc/kernel/vdso32 | |
parent | 5f07aa7524e98d6f68f2bec54f155ef6012e2c9a (diff) |
powerpc: Rework VDSO gettimeofday to prevent time going backwards
Currently it is possible for userspace to see the result of
gettimeofday() going backwards by 1 microsecond, assuming that
userspace is using the gettimeofday() in the VDSO. The VDSO
gettimeofday() algorithm computes the time in "xsecs", which are
units of 2^-20 seconds, or approximately 0.954 microseconds,
using the algorithm
now = (timebase - tb_orig_stamp) * tb_to_xs + stamp_xsec
and then converts the time in xsecs to seconds and microseconds.
The kernel updates the tb_orig_stamp and stamp_xsec values every
tick in update_vsyscall(). If the length of the tick is not an
integer number of xsecs, then some precision is lost in converting
the current time to xsecs. For example, with CONFIG_HZ=1000, the
tick is 1ms long, which is 1048.576 xsecs. That means that
stamp_xsec will advance by either 1048 or 1049 on each tick.
With the right conditions, it is possible for userspace to get
(timebase - tb_orig_stamp) * tb_to_xs being 1049 if the kernel is
slightly late in updating the vdso_datapage, and then for stamp_xsec
to advance by 1048 when the kernel does update it, and for userspace
to then see (timebase - tb_orig_stamp) * tb_to_xs being zero due to
integer truncation. The result is that time appears to go backwards
by 1 microsecond.
To fix this we change the VDSO gettimeofday to use a new field in the
VDSO datapage which stores the nanoseconds part of the time as a
fractional number of seconds in a 0.32 binary fraction format.
(Or put another way, as a 32-bit number in units of 0.23283 ns.)
This is convenient because we can use the mulhwu instruction to
convert it to either microseconds or nanoseconds.
Since it turns out that computing the time of day using this new field
is simpler than either using stamp_xsec (as gettimeofday does) or
stamp_xtime.tv_nsec (as clock_gettime does), this converts both
gettimeofday and clock_gettime to use the new field. The existing
__do_get_tspec function is converted to use the new field and take
a parameter in r7 that indicates the desired resolution, 1,000,000
for microseconds or 1,000,000,000 for nanoseconds. The __do_get_xsec
function is then unused and is deleted.
The new algorithm is
now = ((timebase - tb_orig_stamp) << 12) * tb_to_xs
+ (stamp_xtime_seconds << 32) + stamp_sec_fraction
with 'now' in units of 2^-32 seconds. That is then converted to
seconds and either microseconds or nanoseconds with
seconds = now >> 32
partseconds = ((now & 0xffffffff) * resolution) >> 32
The 32-bit VDSO code also makes a further simplification: it ignores
the bottom 32 bits of the tb_to_xs value, which is a 0.64 format binary
fraction. Doing so gets rid of 4 multiply instructions. Assuming
a timebase frequency of 1GHz or less and an update interval of no
more than 10ms, the upper 32 bits of tb_to_xs will be at least
4503599, so the error from ignoring the low 32 bits will be at most
2.2ns, which is more than an order of magnitude less than the time
taken to do gettimeofday or clock_gettime on our fastest processors,
so there is no possibility of seeing inconsistent values due to this.
This also moves update_gtod() down next to its only caller, and makes
update_vsyscall use the time passed in via the wall_time argument rather
than accessing xtime directly. At present, wall_time always points to
xtime, but that could change in future.
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/kernel/vdso32')
-rw-r--r-- | arch/powerpc/kernel/vdso32/gettimeofday.S | 184 |
1 files changed, 42 insertions, 142 deletions
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index ee038d4bf252..4ee09ee2e836 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S | |||
@@ -19,8 +19,10 @@ | |||
19 | /* Offset for the low 32-bit part of a field of long type */ | 19 | /* Offset for the low 32-bit part of a field of long type */ |
20 | #ifdef CONFIG_PPC64 | 20 | #ifdef CONFIG_PPC64 |
21 | #define LOPART 4 | 21 | #define LOPART 4 |
22 | #define TSPEC_TV_SEC TSPC64_TV_SEC+LOPART | ||
22 | #else | 23 | #else |
23 | #define LOPART 0 | 24 | #define LOPART 0 |
25 | #define TSPEC_TV_SEC TSPC32_TV_SEC | ||
24 | #endif | 26 | #endif |
25 | 27 | ||
26 | .text | 28 | .text |
@@ -41,23 +43,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) | |||
41 | mr r9, r3 /* datapage ptr in r9 */ | 43 | mr r9, r3 /* datapage ptr in r9 */ |
42 | cmplwi r10,0 /* check if tv is NULL */ | 44 | cmplwi r10,0 /* check if tv is NULL */ |
43 | beq 3f | 45 | beq 3f |
44 | bl __do_get_xsec@local /* get xsec from tb & kernel */ | 46 | lis r7,1000000@ha /* load up USEC_PER_SEC */ |
45 | bne- 2f /* out of line -> do syscall */ | 47 | addi r7,r7,1000000@l /* so we get microseconds in r4 */ |
46 | 48 | bl __do_get_tspec@local /* get sec/usec from tb & kernel */ | |
47 | /* seconds are xsec >> 20 */ | 49 | stw r3,TVAL32_TV_SEC(r10) |
48 | rlwinm r5,r4,12,20,31 | 50 | stw r4,TVAL32_TV_USEC(r10) |
49 | rlwimi r5,r3,12,0,19 | ||
50 | stw r5,TVAL32_TV_SEC(r10) | ||
51 | |||
52 | /* get remaining xsec and convert to usec. we scale | ||
53 | * up remaining xsec by 12 bits and get the top 32 bits | ||
54 | * of the multiplication | ||
55 | */ | ||
56 | rlwinm r5,r4,12,0,19 | ||
57 | lis r6,1000000@h | ||
58 | ori r6,r6,1000000@l | ||
59 | mulhwu r5,r5,r6 | ||
60 | stw r5,TVAL32_TV_USEC(r10) | ||
61 | 51 | ||
62 | 3: cmplwi r11,0 /* check if tz is NULL */ | 52 | 3: cmplwi r11,0 /* check if tz is NULL */ |
63 | beq 1f | 53 | beq 1f |
@@ -70,14 +60,6 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) | |||
70 | crclr cr0*4+so | 60 | crclr cr0*4+so |
71 | li r3,0 | 61 | li r3,0 |
72 | blr | 62 | blr |
73 | |||
74 | 2: | ||
75 | mtlr r12 | ||
76 | mr r3,r10 | ||
77 | mr r4,r11 | ||
78 | li r0,__NR_gettimeofday | ||
79 | sc | ||
80 | blr | ||
81 | .cfi_endproc | 63 | .cfi_endproc |
82 | V_FUNCTION_END(__kernel_gettimeofday) | 64 | V_FUNCTION_END(__kernel_gettimeofday) |
83 | 65 | ||
@@ -100,7 +82,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) | |||
100 | mr r11,r4 /* r11 saves tp */ | 82 | mr r11,r4 /* r11 saves tp */ |
101 | bl __get_datapage@local /* get data page */ | 83 | bl __get_datapage@local /* get data page */ |
102 | mr r9,r3 /* datapage ptr in r9 */ | 84 | mr r9,r3 /* datapage ptr in r9 */ |
103 | 85 | lis r7,NSEC_PER_SEC@h /* want nanoseconds */ | |
86 | ori r7,r7,NSEC_PER_SEC@l | ||
104 | 50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ | 87 | 50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ |
105 | bne cr1,80f /* not monotonic -> all done */ | 88 | bne cr1,80f /* not monotonic -> all done */ |
106 | 89 | ||
@@ -198,83 +181,12 @@ V_FUNCTION_END(__kernel_clock_getres) | |||
198 | 181 | ||
199 | 182 | ||
200 | /* | 183 | /* |
201 | * This is the core of gettimeofday() & friends, it returns the xsec | 184 | * This is the core of clock_gettime() and gettimeofday(), |
202 | * value in r3 & r4 and expects the datapage ptr (non clobbered) | 185 | * it returns the current time in r3 (seconds) and r4. |
203 | * in r9. clobbers r0,r4,r5,r6,r7,r8. | 186 | * On entry, r7 gives the resolution of r4, either USEC_PER_SEC |
204 | * When returning, r8 contains the counter value that can be reused | 187 | * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds. |
205 | * by the monotonic clock implementation | ||
206 | */ | ||
207 | __do_get_xsec: | ||
208 | .cfi_startproc | ||
209 | /* Check for update count & load values. We use the low | ||
210 | * order 32 bits of the update count | ||
211 | */ | ||
212 | 1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) | ||
213 | andi. r0,r8,1 /* pending update ? loop */ | ||
214 | bne- 1b | ||
215 | xor r0,r8,r8 /* create dependency */ | ||
216 | add r9,r9,r0 | ||
217 | |||
218 | /* Load orig stamp (offset to TB) */ | ||
219 | lwz r5,CFG_TB_ORIG_STAMP(r9) | ||
220 | lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) | ||
221 | |||
222 | /* Get a stable TB value */ | ||
223 | 2: mftbu r3 | ||
224 | mftbl r4 | ||
225 | mftbu r0 | ||
226 | cmpl cr0,r3,r0 | ||
227 | bne- 2b | ||
228 | |||
229 | /* Substract tb orig stamp. If the high part is non-zero, we jump to | ||
230 | * the slow path which call the syscall. | ||
231 | * If it's ok, then we have our 32 bits tb_ticks value in r7 | ||
232 | */ | ||
233 | subfc r7,r6,r4 | ||
234 | subfe. r0,r5,r3 | ||
235 | bne- 3f | ||
236 | |||
237 | /* Load scale factor & do multiplication */ | ||
238 | lwz r5,CFG_TB_TO_XS(r9) /* load values */ | ||
239 | lwz r6,(CFG_TB_TO_XS+4)(r9) | ||
240 | mulhwu r4,r7,r5 | ||
241 | mulhwu r6,r7,r6 | ||
242 | mullw r0,r7,r5 | ||
243 | addc r6,r6,r0 | ||
244 | |||
245 | /* At this point, we have the scaled xsec value in r4 + XER:CA | ||
246 | * we load & add the stamp since epoch | ||
247 | */ | ||
248 | lwz r5,CFG_STAMP_XSEC(r9) | ||
249 | lwz r6,(CFG_STAMP_XSEC+4)(r9) | ||
250 | adde r4,r4,r6 | ||
251 | addze r3,r5 | ||
252 | |||
253 | /* We now have our result in r3,r4. We create a fake dependency | ||
254 | * on that result and re-check the counter | ||
255 | */ | ||
256 | or r6,r4,r3 | ||
257 | xor r0,r6,r6 | ||
258 | add r9,r9,r0 | ||
259 | lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) | ||
260 | cmpl cr0,r8,r0 /* check if updated */ | ||
261 | bne- 1b | ||
262 | |||
263 | /* Warning ! The caller expects CR:EQ to be set to indicate a | ||
264 | * successful calculation (so it won't fallback to the syscall | ||
265 | * method). We have overriden that CR bit in the counter check, | ||
266 | * but fortunately, the loop exit condition _is_ CR:EQ set, so | ||
267 | * we can exit safely here. If you change this code, be careful | ||
268 | * of that side effect. | ||
269 | */ | ||
270 | 3: blr | ||
271 | .cfi_endproc | ||
272 | |||
273 | /* | ||
274 | * This is the core of clock_gettime(), it returns the current | ||
275 | * time in seconds and nanoseconds in r3 and r4. | ||
276 | * It expects the datapage ptr in r9 and doesn't clobber it. | 188 | * It expects the datapage ptr in r9 and doesn't clobber it. |
277 | * It clobbers r0, r5, r6, r10 and returns NSEC_PER_SEC in r7. | 189 | * It clobbers r0, r5 and r6. |
278 | * On return, r8 contains the counter value that can be reused. | 190 | * On return, r8 contains the counter value that can be reused. |
279 | * This clobbers cr0 but not any other cr field. | 191 | * This clobbers cr0 but not any other cr field. |
280 | */ | 192 | */ |
@@ -297,70 +209,58 @@ __do_get_tspec: | |||
297 | 2: mftbu r3 | 209 | 2: mftbu r3 |
298 | mftbl r4 | 210 | mftbl r4 |
299 | mftbu r0 | 211 | mftbu r0 |
300 | cmpl cr0,r3,r0 | 212 | cmplw cr0,r3,r0 |
301 | bne- 2b | 213 | bne- 2b |
302 | 214 | ||
303 | /* Subtract tb orig stamp and shift left 12 bits. | 215 | /* Subtract tb orig stamp and shift left 12 bits. |
304 | */ | 216 | */ |
305 | subfc r7,r6,r4 | 217 | subfc r4,r6,r4 |
306 | subfe r0,r5,r3 | 218 | subfe r0,r5,r3 |
307 | slwi r0,r0,12 | 219 | slwi r0,r0,12 |
308 | rlwimi. r0,r7,12,20,31 | 220 | rlwimi. r0,r4,12,20,31 |
309 | slwi r7,r7,12 | 221 | slwi r4,r4,12 |
310 | 222 | ||
311 | /* Load scale factor & do multiplication */ | 223 | /* |
224 | * Load scale factor & do multiplication. | ||
225 | * We only use the high 32 bits of the tb_to_xs value. | ||
226 | * Even with a 1GHz timebase clock, the high 32 bits of | ||
227 | * tb_to_xs will be at least 4 million, so the error from | ||
228 | * ignoring the low 32 bits will be no more than 0.25ppm. | ||
229 | * The error will just make the clock run very very slightly | ||
230 | * slow until the next time the kernel updates the VDSO data, | ||
231 | * at which point the clock will catch up to the kernel's value, | ||
232 | * so there is no long-term error accumulation. | ||
233 | */ | ||
312 | lwz r5,CFG_TB_TO_XS(r9) /* load values */ | 234 | lwz r5,CFG_TB_TO_XS(r9) /* load values */ |
313 | lwz r6,(CFG_TB_TO_XS+4)(r9) | 235 | mulhwu r4,r4,r5 |
314 | mulhwu r3,r7,r6 | ||
315 | mullw r10,r7,r5 | ||
316 | mulhwu r4,r7,r5 | ||
317 | addc r10,r3,r10 | ||
318 | li r3,0 | 236 | li r3,0 |
319 | 237 | ||
320 | beq+ 4f /* skip high part computation if 0 */ | 238 | beq+ 4f /* skip high part computation if 0 */ |
321 | mulhwu r3,r0,r5 | 239 | mulhwu r3,r0,r5 |
322 | mullw r7,r0,r5 | 240 | mullw r5,r0,r5 |
323 | mulhwu r5,r0,r6 | ||
324 | mullw r6,r0,r6 | ||
325 | adde r4,r4,r7 | ||
326 | addze r3,r3 | ||
327 | addc r4,r4,r5 | 241 | addc r4,r4,r5 |
328 | addze r3,r3 | 242 | addze r3,r3 |
329 | addc r10,r10,r6 | 243 | 4: |
330 | 244 | /* At this point, we have seconds since the xtime stamp | |
331 | 4: addze r4,r4 /* add in carry */ | 245 | * as a 32.32 fixed-point number in r3 and r4. |
332 | lis r7,NSEC_PER_SEC@h | 246 | * Load & add the xtime stamp. |
333 | ori r7,r7,NSEC_PER_SEC@l | ||
334 | mulhwu r4,r4,r7 /* convert to nanoseconds */ | ||
335 | |||
336 | /* At this point, we have seconds & nanoseconds since the xtime | ||
337 | * stamp in r3+CA and r4. Load & add the xtime stamp. | ||
338 | */ | 247 | */ |
339 | #ifdef CONFIG_PPC64 | 248 | lwz r5,STAMP_XTIME+TSPEC_TV_SEC(r9) |
340 | lwz r5,STAMP_XTIME+TSPC64_TV_SEC+LOPART(r9) | 249 | lwz r6,STAMP_SEC_FRAC(r9) |
341 | lwz r6,STAMP_XTIME+TSPC64_TV_NSEC+LOPART(r9) | 250 | addc r4,r4,r6 |
342 | #else | ||
343 | lwz r5,STAMP_XTIME+TSPC32_TV_SEC(r9) | ||
344 | lwz r6,STAMP_XTIME+TSPC32_TV_NSEC(r9) | ||
345 | #endif | ||
346 | add r4,r4,r6 | ||
347 | adde r3,r3,r5 | 251 | adde r3,r3,r5 |
348 | 252 | ||
349 | /* We now have our result in r3,r4. We create a fake dependency | 253 | /* We create a fake dependency on the result in r3/r4 |
350 | * on that result and re-check the counter | 254 | * and re-check the counter |
351 | */ | 255 | */ |
352 | or r6,r4,r3 | 256 | or r6,r4,r3 |
353 | xor r0,r6,r6 | 257 | xor r0,r6,r6 |
354 | add r9,r9,r0 | 258 | add r9,r9,r0 |
355 | lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) | 259 | lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) |
356 | cmpl cr0,r8,r0 /* check if updated */ | 260 | cmplw cr0,r8,r0 /* check if updated */ |
357 | bne- 1b | 261 | bne- 1b |
358 | 262 | ||
359 | /* check for nanosecond overflow and adjust if necessary */ | 263 | mulhwu r4,r4,r7 /* convert to micro or nanoseconds */ |
360 | cmpw r4,r7 | ||
361 | bltlr /* all done if no overflow */ | ||
362 | subf r4,r7,r4 /* adjust if overflow */ | ||
363 | addi r3,r3,1 | ||
364 | 264 | ||
365 | blr | 265 | blr |
366 | .cfi_endproc | 266 | .cfi_endproc |