author    Thomas Gleixner <tglx@linutronix.de>    2014-07-16 17:05:23 -0400
committer John Stultz <john.stultz@linaro.org>    2014-07-23 18:01:55 -0400
commit    4396e058c52e167729729cf64ea3dfa229637086 (patch)
tree      dcd6a24f5c16d0ed53495aac066b5fea0f88bb5c
parent    9b0fd802e8c0545148324916055e7b40d97963fa (diff)
timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC
Tracers want a correlated time between the kernel instrumentation and user space. We really do not want to export sched_clock() to user space, so we need to provide something sensible for this.

Using separate data structures with a non-blocking, sequence count based update mechanism allows us to do that. The data structure required for the readout has a sequence counter and two copies of the timekeeping data.

On the update side:

	smp_wmb();
	tkf->seq++;
	smp_wmb();
	update(tkf->base[0], tk);
	smp_wmb();
	tkf->seq++;
	smp_wmb();
	update(tkf->base[1], tk);

On the reader side:

	do {
		seq = tkf->seq;
		smp_rmb();
		idx = seq & 0x01;
		now = now(tkf->base[idx]);
		smp_rmb();
	} while (seq != tkf->seq)

So if an NMI hits the update of base[0] it will use base[1], which is still consistent, but this timestamp is not guaranteed to be monotonic across an update.

The timestamp is calculated by:

	now = base_mono + clock_delta * slope

So if the update lowers the slope, readers who are forced to the not yet updated second array are still using the old, steeper slope.

	tmono
	^
	|    o  n
	|   o n
	|  u
	| o
	|o
	|12345678---> reader order

	o = old slope
	u = update
	n = new slope

So reader 6 will observe time going backwards versus reader 5.

While other CPUs are likely to be able to observe that, the only way for a CPU-local observation is when an NMI hits in the middle of the update. Timestamps taken from that NMI context might be ahead of the following timestamps. Callers need to be aware of that and deal with it.

V2: Got rid of clock monotonic raw and reorganized the data structures. Folded in the barrier fix from Mathieu.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
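For illustration only, here is a minimal sketch of how a tracer might consume the new interface from any context, including NMI. The record structure and handler are hypothetical; only ktime_get_mono_fast_ns() is provided by this patch:

	#include <linux/timekeeping.h>
	#include <linux/smp.h>
	#include <linux/types.h>

	/* Hypothetical per-event record emitted by a tracer (not part of this patch) */
	struct trace_stamp {
		u64	ts_ns;		/* CLOCK_MONOTONIC via the fast, NMI safe readout */
		u32	cpu;
		u32	event_id;
	};

	/*
	 * May be called from any context, including NMI. The readout never
	 * blocks on the timekeeper sequence count; the caller must tolerate
	 * a timestamp that is slightly ahead when an NMI interrupts a
	 * timekeeping update.
	 */
	static void notrace trace_stamp_event(struct trace_stamp *rec, u32 event_id)
	{
		rec->ts_ns    = ktime_get_mono_fast_ns();
		rec->cpu      = raw_smp_processor_id();
		rec->event_id = event_id;
	}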
-rw-r--r--	include/linux/timekeeping.h	2
-rw-r--r--	kernel/time/timekeeping.c	124
2 files changed, 126 insertions(+), 0 deletions(-)
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 58ad7eff83ff..1caa6b04fdc5 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -164,6 +164,8 @@ static inline u64 ktime_get_raw_ns(void)
 	return ktime_to_ns(ktime_get_raw());
 }
 
+extern u64 ktime_get_mono_fast_ns(void);
+
 /*
  * Timespec interfaces utilizing the ktime based ones
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dee23c9d6c21..8980fb722fc5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,6 +44,22 @@ static struct {
 static DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
 
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq:	Sequence counter for protecting updates. The lowest bit
+ *		is the index for the tk_read_base array
+ * @base:	tk_read_base array. Access is indexed by the lowest bit of
+ *		@seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+	seqcount_t		seq;
+	struct tk_read_base	base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
@@ -210,6 +226,112 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 	return nsec + arch_gettimeoffset();
 }
 
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk:		The timekeeper from which we take the update
+ * @tkf:	The fast timekeeper to update
+ * @tbase:	The time base for the fast timekeeper (mono/raw)
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ *	smp_wmb();	<- Ensure that the last base[1] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[0], tk);
+ *	smp_wmb();	<- Ensure that the base[0] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ *	do {
+ *		seq = tkf->seq;
+ *		smp_rmb();
+ *		idx = seq & 0x01;
+ *		now = now(tkf->base[idx]);
+ *		smp_rmb();
+ *	} while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if an NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result in a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+	struct tk_read_base *base = tk_fast_mono.base;
+
+	/* Force readers off to base[1] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[0] */
+	memcpy(base, &tk->tkr, sizeof(*base));
+
+	/* Force readers back to base[0] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[1] */
+	memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ *	now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * |    o  n
+ * |   o n
+ * |  u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able to observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+	struct tk_read_base *tkr;
+	unsigned int seq;
+	u64 now;
+
+	do {
+		seq = raw_read_seqcount(&tk_fast_mono.seq);
+		tkr = tk_fast_mono.base + (seq & 0x01);
+		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+	} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+	return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
 
 static inline void update_vsyscall(struct timekeeper *tk)
@@ -325,6 +447,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
+
+	update_fast_timekeeper(tk);
 }
 
 /**
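To make the non-monotonic window described in the changelog above concrete, here is a toy user-space calculation. All constants are invented for illustration and merely mirror the formula now = base_mono + clock_delta * slope; none of them come from the patch:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/*
		 * Made-up numbers: slopes expressed in 1/1000 ns per cycle,
		 * old slope 1.000, new slope 0.998.
		 */
		uint64_t base_mono = 1000000000ULL;	/* ns at the previous update */

		/*
		 * Reader 5 hits the update window, is forced onto the stale copy
		 * and extrapolates 1000000 cycles with the old, steeper slope.
		 */
		uint64_t t5 = base_mono + (1000000ULL * 1000) / 1000;

		/*
		 * The update re-anchors the base where the old line was after
		 * 900000 cycles, then applies the new, flatter slope.
		 */
		uint64_t new_base = base_mono + (900000ULL * 1000) / 1000;

		/* Reader 6 runs after the update, 100000 cycles past the re-anchor. */
		uint64_t t6 = new_base + (100000ULL * 998) / 1000;

		/* t5 = 1001000000 ns, t6 = 1000999800 ns: reader 6 is 200 ns behind. */
		printf("t5=%llu ns, t6=%llu ns\n",
		       (unsigned long long)t5, (unsigned long long)t6);
		return 0;
	}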