 include/linux/timekeeping.h |   2 ++
 kernel/time/timekeeping.c   | 124 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+), 0 deletions(-)
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 58ad7eff83ff..1caa6b04fdc5 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -164,6 +164,8 @@ static inline u64 ktime_get_raw_ns(void)
 	return ktime_to_ns(ktime_get_raw());
 }
 
+extern u64 ktime_get_mono_fast_ns(void);
+
 /*
  * Timespec interfaces utilizing the ktime based ones
  */
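The header change simply exports the new accessor. The point of the fast path is that, unlike ktime_get(), it never waits for an in-flight update to complete, so an NMI that interrupts the update side on the same CPU cannot live-lock the reader. A minimal, hypothetical caller sketch (my_nmi_timestamp() is an illustrative name, not kernel API; only ktime_get_mono_fast_ns() comes from this patch):

	/*
	 * Hypothetical example, not part of this patch: timestamping
	 * from NMI context.
	 */
	static u64 my_nmi_timestamp(void)
	{
		/*
		 * Cannot spin indefinitely: an interrupted updater cannot
		 * advance the sequence count, so the retry loop terminates.
		 * At worst the result is a few nanoseconds off across a
		 * concurrent timekeeper update.
		 */
		return ktime_get_mono_fast_ns();
	}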
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dee23c9d6c21..8980fb722fc5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,6 +44,22 @@ static struct {
 static DEFINE_RAW_SPINLOCK(timekeeper_lock);
 static struct timekeeper shadow_timekeeper;
 
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq:	Sequence counter for protecting updates. The lowest bit
+ *		is the index for the tk_read_base array
+ * @base:	tk_read_base array. Access is indexed by the lowest bit of
+ *		@seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+	seqcount_t		seq;
+	struct tk_read_base	base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
 
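Note the ____cacheline_aligned on tk_fast_mono: the reader touches @seq plus one of the two read bases on every call, so aligning the structure keeps that hot data on as few cache lines as possible and avoids false sharing with adjacent variables.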
@@ -210,6 +226,112 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 	return nsec + arch_gettimeoffset();
 }
 
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk:	The timekeeper from which we take the update
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently from the other timekeeping accessor
+ * functions, which retry when the sequence count has changed. The
+ * update side does:
+ *
+ *	smp_wmb();	<- Ensure that the last base[1] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[0], tk);
+ *	smp_wmb();	<- Ensure that the base[0] update is visible
+ *	tkf->seq++;
+ *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ *	do {
+ *		seq = tkf->seq;
+ *		smp_rmb();
+ *		idx = seq & 0x01;
+ *		now = now(tkf->base[idx]);
+ *		smp_rmb();
+ *	} while (seq != tkf->seq);
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if an NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result in a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+	struct tk_read_base *base = tk_fast_mono.base;
+
+	/* Force readers off to base[1] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[0] */
+	memcpy(base, &tk->tkr, sizeof(*base));
+
+	/* Force readers back to base[0] */
+	raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+	/* Update base[1] */
+	memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ *	now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers that are forced onto the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ *	tmono
+ *	^
+ *	|    o  n
+ *	|   o n
+ *	|  u
+ *	| o
+ *	|o
+ *	|12345678---> reader order
+ *
+ *	o = old slope
+ *	u = update
+ *	n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able to observe that, the only way
+ * to observe it on the local CPU is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+	struct tk_read_base *tkr;
+	unsigned int seq;
+	u64 now;
+
+	do {
+		seq = raw_read_seqcount(&tk_fast_mono.seq);
+		tkr = tk_fast_mono.base + (seq & 0x01);
+		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+	} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+	return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
 
 static inline void update_vsyscall(struct timekeeper *tk)
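The two raw_write_seqcount_latch() calls above implement the write-side barrier scheme spelled out in the update_fast_timekeeper() comment. For reference, at the time of this patch that helper reduces to roughly the following (a sketch of the seqlock.h primitive, quoted from memory, not part of this diff):

	static inline void raw_write_seqcount_latch(seqcount_t *s)
	{
		smp_wmb();	/* prior stores before incrementing "sequence" */
		s->sequence++;
		smp_wmb();	/* increment "sequence" before following stores */
	}

Each call flips the low bit of the sequence counter, which is what moves readers between base[0] and base[1].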
@@ -325,6 +447,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	if (action & TK_MIRROR)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
+
+	update_fast_timekeeper(tk);
 }
 
 /**
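With this final hunk every timekeeping_update() call also refreshes the fast timekeeper. The update side is serialized by timekeeper_lock (timekeeping_update() runs with it held), so there is exactly one writer at a time and the raw_ seqcount primitives, which skip lockdep tracking, need no further serialization.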