author    Thomas Gleixner <tglx@linutronix.de>    2014-07-16 17:05:23 -0400
committer John Stultz <john.stultz@linaro.org>    2014-07-23 18:01:55 -0400
commit    4396e058c52e167729729cf64ea3dfa229637086 (patch)
tree      dcd6a24f5c16d0ed53495aac066b5fea0f88bb5c
parent    9b0fd802e8c0545148324916055e7b40d97963fa (diff)
timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC
Tracers want a correlated time between the kernel instrumentation and user space. We really do not want to export sched_clock() to user space, so we need to provide something sensible for this.

Using separate data structures with a non-blocking, sequence count based update mechanism allows us to do that. The data structure required for the readout has a sequence counter and two copies of the timekeeping data.

On the update side:

	smp_wmb();
	tkf->seq++;
	smp_wmb();
	update(tkf->base[0], tk);
	smp_wmb();
	tkf->seq++;
	smp_wmb();
	update(tkf->base[1], tk);

On the reader side:

	do {
		seq = tkf->seq;
		smp_rmb();
		idx = seq & 0x01;
		now = now(tkf->base[idx]);
		smp_rmb();
	} while (seq != tkf->seq)

So if an NMI hits the update of base[0] it will use base[1], which is still consistent, but this timestamp is not guaranteed to be monotonic across an update.

The timestamp is calculated by:

	now = base_mono + clock_delta * slope

So if the update lowers the slope, readers who are forced to the not yet updated second array are still using the old, steeper slope.

	tmono
	^
	|    o  n
	|   o n
	|  u
	| o
	|o
	|12345678---> reader order

	o = old slope
	u = update
	n = new slope

So reader 6 will observe time going backwards versus reader 5.

While other CPUs are likely to be able to observe that, the only way for a CPU-local observation is when an NMI hits in the middle of the update. Timestamps taken from that NMI context might be ahead of the following timestamps. Callers need to be aware of that and deal with it.

V2: Got rid of clock monotonic raw and reorganized the data structures. Folded in the barrier fix from Mathieu.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
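For illustration only, here is a minimal sketch of how a tracer might consume the new interface from any context, including NMI. The record structure and handler are hypothetical; only ktime_get_mono_fast_ns() is provided by this patch:

	#include <linux/timekeeping.h>
	#include <linux/smp.h>
	#include <linux/types.h>

	/* Hypothetical per-event record emitted by a tracer (not part of this patch) */
	struct trace_stamp {
		u64	ts_ns;		/* CLOCK_MONOTONIC via the fast, NMI safe readout */
		u32	cpu;
		u32	event_id;
	};

	/*
	 * May be called from any context, including NMI. The readout never
	 * blocks on the timekeeper sequence count; the caller must tolerate
	 * a timestamp that is slightly ahead when an NMI interrupts a
	 * timekeeping update.
	 */
	static void notrace trace_stamp_event(struct trace_stamp *rec, u32 event_id)
	{
		rec->ts_ns    = ktime_get_mono_fast_ns();
		rec->cpu      = raw_smp_processor_id();
		rec->event_id = event_id;
	}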
-rw-r--r--	include/linux/timekeeping.h	2
-rw-r--r--	kernel/time/timekeeping.c	124
2 files changed, 126 insertions(+), 0 deletions(-)
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 58ad7eff83ff..1caa6b04fdc5 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -164,6 +164,8 @@ static inline u64 ktime_get_raw_ns(void)
 	return ktime_to_ns(ktime_get_raw());
 }
 
+extern u64 ktime_get_mono_fast_ns(void);
+
 /*
  * Timespec interfaces utilizing the ktime based ones
  */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dee23c9d6c21..8980fb722fc5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,6 +44,22 @@ static struct {
 static DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
 
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq:	Sequence counter for protecting updates. The lowest bit
+ *		is the index for the tk_read_base array
+ * @base:	tk_read_base array. Access is indexed by the lowest bit of
+ *		@seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+	seqcount_t		seq;
+	struct tk_read_base	base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
@@ -210,6 +226,112 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 	return nsec + arch_gettimeoffset();
 }
 
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk:		The timekeeper from which we take the update
+ * @tkf:	The fast timekeeper to update
+ * @tbase:	The time base for the fast timekeeper (mono/raw)
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ *	smp_wmb();	<- Ensure that the last base[1] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[0], tk);
+ *	smp_wmb();	<- Ensure that the base[0] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ *	do {
+ *		seq = tkf->seq;
+ *		smp_rmb();
+ *		idx = seq & 0x01;
+ *		now = now(tkf->base[idx]);
+ *		smp_rmb();
+ *	} while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if an NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result in a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+	struct tk_read_base *base = tk_fast_mono.base;
+
+	/* Force readers off to base[1] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[0] */
+	memcpy(base, &tk->tkr, sizeof(*base));
+
+	/* Force readers back to base[0] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[1] */
+	memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ *	now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * |    o  n
+ * |   o n
+ * |  u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able to observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+	struct tk_read_base *tkr;
+	unsigned int seq;
+	u64 now;
+
+	do {
+		seq = raw_read_seqcount(&tk_fast_mono.seq);
+		tkr = tk_fast_mono.base + (seq & 0x01);
+		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+	} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+	return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
 
 static inline void update_vsyscall(struct timekeeper *tk)
@@ -325,6 +447,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
+
+	update_fast_timekeeper(tk);
 }
 
 /**
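To make the non-monotonic window described in the changelog above concrete, here is a toy user-space calculation. All constants are invented for illustration and merely mirror the formula now = base_mono + clock_delta * slope; none of them come from the patch:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/*
		 * Made-up numbers: slopes expressed in 1/1000 ns per cycle,
		 * old slope 1.000, new slope 0.998.
		 */
		uint64_t base_mono = 1000000000ULL;	/* ns at the previous update */

		/*
		 * Reader 5 hits the update window, is forced onto the stale copy
		 * and extrapolates 1000000 cycles with the old, steeper slope.
		 */
		uint64_t t5 = base_mono + (1000000ULL * 1000) / 1000;

		/*
		 * The update re-anchors the base where the old line was after
		 * 900000 cycles, then applies the new, flatter slope.
		 */
		uint64_t new_base = base_mono + (900000ULL * 1000) / 1000;

		/* Reader 6 runs after the update, 100000 cycles past the re-anchor. */
		uint64_t t6 = new_base + (100000ULL * 998) / 1000;

		/* t5 = 1001000000 ns, t6 = 1000999800 ns: reader 6 is 200 ns behind. */
		printf("t5=%llu ns, t6=%llu ns\n",
		       (unsigned long long)t5, (unsigned long long)t6);
		return 0;
	}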