Diffstat (limited to 'arch/x86/kernel/tsc.c')
-rw-r--r--  arch/x86/kernel/tsc.c  330
1 file changed, 265 insertions(+), 65 deletions(-)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..57e5ce126d5a 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
+#include <linux/static_key.h>
 
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
    erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
 
+static struct static_key __use_tsc = STATIC_KEY_INIT;
+
 int tsc_clocksource_reliable;
+
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The downside is that we can no longer guarantee strict monotonicity
+ * (assuming the TSC was monotonic to begin with): while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching, we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with its times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+
+struct cyc2ns {
+        struct cyc2ns_data data[2];     /*  0 + 2*24 = 48 */
+        struct cyc2ns_data *head;       /* 48 + 8    = 56 */
+        struct cyc2ns_data *tail;       /* 56 + 8    = 64 */
+}; /* exactly fits one cacheline */
+
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+        struct cyc2ns_data *head;
+
+        preempt_disable();
+
+        head = this_cpu_read(cyc2ns.head);
+        /*
+         * Ensure we observe the entry when we observe the pointer to it.
+         * Matches the wmb from cyc2ns_write_end().
+         */
+        smp_read_barrier_depends();
+        head->__count++;
+        barrier();
+
+        return head;
+}
+
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+        barrier();
+        /*
+         * If we're the outermost nested read, update the tail pointer
+         * when we're done. This notifies possible pending writers
+         * that we've observed the head pointer and that the other
+         * entry is now free.
+         */
+        if (!--head->__count) {
+                /*
+                 * x86-TSO does not reorder writes with older reads;
+                 * therefore once this write becomes visible to another
+                 * cpu, we must be finished reading the cyc2ns_data.
+                 *
+                 * Matches with cyc2ns_write_begin().
+                 */
+                this_cpu_write(cyc2ns.tail, head);
+        }
+        preempt_enable();
+}
+
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+        struct cyc2ns_data *data = c2n->data;
+
+        if (data == c2n->head)
+                data++;
+
+        /* XXX send an IPI to @cpu in order to guarantee a read? */
+
+        /*
+         * When we observe the tail write from cyc2ns_read_end(),
+         * the cpu must be done with that entry and it's safe
+         * to start writing to it.
+         */
+        while (c2n->tail == data)
+                cpu_relax();
+
+        return data;
+}
+
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+        /*
+         * Ensure the @data writes are visible before we publish the
+         * entry. Matches the data-dependency in cyc2ns_read_begin().
+         */
+        smp_wmb();
+
+        ACCESS_ONCE(c2n->head) = data;
+}
+
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ * basic equation:
+ *              ns = cycles / (freq / ns_per_sec)
+ *              ns = cycles * (ns_per_sec / freq)
+ *              ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *              ns = cycles * (10^6 / cpu_khz)
+ *
+ * Then we use scaling math (suggested by george@mvista.com) to get:
+ *              ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *              ns = cycles * cyc2ns_scale / SC
+ *
+ * And since SC is a constant power of two, we can convert the div
+ * into a shift.
+ *
+ * We can use khz divisor instead of mhz to keep a better precision, since
+ * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ * (mathieu.desnoyers@polymtl.ca)
+ *
+ *              -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+        data->cyc2ns_mul = 0;
+        data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+        data->cyc2ns_offset = 0;
+        data->__count = 0;
+}
+
+static void cyc2ns_init(int cpu)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+
+        cyc2ns_data_init(&c2n->data[0]);
+        cyc2ns_data_init(&c2n->data[1]);
+
+        c2n->head = c2n->data;
+        c2n->tail = c2n->data;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+        struct cyc2ns_data *data, *tail;
+        unsigned long long ns;
+
+        /*
+         * See cyc2ns_read_*() for details; replicated in order to avoid
+         * an extra few instructions that came with the abstraction.
+         * Notably, it allows us to only do the __count and tail update
+         * dance when it's actually needed.
+         */
+
+        preempt_disable_notrace();
+        data = this_cpu_read(cyc2ns.head);
+        tail = this_cpu_read(cyc2ns.tail);
+
+        if (likely(data == tail)) {
+                ns = data->cyc2ns_offset;
+                ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+        } else {
+                data->__count++;
+
+                barrier();
+
+                ns = data->cyc2ns_offset;
+                ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+                barrier();
+
+                if (!--data->__count)
+                        this_cpu_write(cyc2ns.tail, data);
+        }
+        preempt_enable_notrace();
+
+        return ns;
+}
+
+/* XXX surely we already have this someplace in the kernel?! */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+        unsigned long long tsc_now, ns_now;
+        struct cyc2ns_data *data;
+        unsigned long flags;
+
+        local_irq_save(flags);
+        sched_clock_idle_sleep_event();
+
+        if (!cpu_khz)
+                goto done;
+
+        data = cyc2ns_write_begin(cpu);
+
+        rdtscll(tsc_now);
+        ns_now = cycles_2_ns(tsc_now);
+
+        /*
+         * Compute a new multiplier as per the above comment and ensure our
+         * time function is continuous; see the comment near struct
+         * cyc2ns_data.
+         */
+        data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+        data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+        data->cyc2ns_offset = ns_now -
+                mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+
+        cyc2ns_write_end(cpu, data);
+
+done:
+        sched_clock_idle_wakeup_event(0);
+        local_irq_restore(flags);
+}
 /*
  * Scheduler clock - returns current time in nanosec units.
  */
 u64 native_sched_clock(void)
 {
-        u64 this_offset;
+        u64 tsc_now;
 
         /*
          * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
          * very important for it to be as fast as the platform
          * can achieve it. )
          */
-        if (unlikely(tsc_disabled)) {
+        if (!static_key_false(&__use_tsc)) {
                 /* No locking but a rare wrong value is not a big deal: */
                 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
         }
 
         /* read the Time Stamp Counter: */
-        rdtscll(this_offset);
+        rdtscll(tsc_now);
 
         /* return the value in ns */
-        return __cycles_2_ns(this_offset);
+        return cycles_2_ns(tsc_now);
 }
 
 /* We need to define a real function for sched_clock, to override the
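The conversion used by cycles_2_ns()/set_cyc2ns_scale() above is exactly the fixed-point math from the "Accelerators for sched_clock()" comment: ns = cycles * cyc2ns_mul >> CYC2NS_SCALE_FACTOR, plus an offset that keeps the clock continuous. As a quick sanity check of the arithmetic (not part of the patch), here is a standalone userspace sketch; the 2,400,000 kHz figure is a hypothetical 2.4 GHz part, and mul_u64_u32_shr() is re-implemented locally, assuming a GCC/Clang-style __int128:

/* Userspace sketch of the cyc2ns fixed-point conversion; not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define CYC2NS_SCALE_FACTOR     10
#define NSEC_PER_MSEC           1000000ULL
#define DIV_ROUND(n, d)         (((n) + ((d) / 2)) / (d))

/* Local stand-in for the kernel's mul_u64_u32_shr() helper. */
static uint64_t mul_u64_u32_shr(uint64_t cyc, uint32_t mul, unsigned int shift)
{
        return (uint64_t)(((unsigned __int128)cyc * mul) >> shift);
}

int main(void)
{
        uint64_t cpu_khz = 2400000;             /* hypothetical 2.4 GHz CPU */
        uint32_t cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
        uint64_t cycles = 2400000000ULL;        /* one second worth of cycles */

        printf("cyc2ns_mul = %u\n", cyc2ns_mul);        /* prints 427 */
        printf("ns = %llu\n", (unsigned long long)
               mul_u64_u32_shr(cycles, cyc2ns_mul, CYC2NS_SCALE_FACTOR));
        return 0;
}

With these numbers cyc2ns_mul works out to 427, so one second worth of cycles converts to 1,000,781,250 ns, roughly 0.08% fast; the cyc2ns_offset written in set_cyc2ns_scale() is what keeps sched_clock() continuous whenever the multiplier is recomputed.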
@@ -419,6 +651,13 @@ unsigned long native_calibrate_tsc(void)
         unsigned long flags, latch, ms, fast_calibrate;
         int hpet = is_hpet_enabled(), i, loopmin;
 
+        /* Calibrate TSC using MSR for Intel Atom SoCs */
+        local_irq_save(flags);
+        fast_calibrate = try_msr_calibrate_tsc();
+        local_irq_restore(flags);
+        if (fast_calibrate)
+                return fast_calibrate;
+
         local_irq_save(flags);
         fast_calibrate = quick_pit_calibrate();
         local_irq_restore(flags);
@@ -589,61 +828,11 @@ int recalibrate_cpu_khz(void)
 EXPORT_SYMBOL(recalibrate_cpu_khz);
 
 
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- * basic equation:
- *              ns = cycles / (freq / ns_per_sec)
- *              ns = cycles * (ns_per_sec / freq)
- *              ns = cycles * (10^9 / (cpu_khz * 10^3))
- *              ns = cycles * (10^6 / cpu_khz)
- *
- * Then we use scaling math (suggested by george@mvista.com) to get:
- *              ns = cycles * (10^6 * SC / cpu_khz) / SC
- *              ns = cycles * cyc2ns_scale / SC
- *
- * And since SC is a constant power of two, we can convert the div
- * into a shift.
- *
- * We can use khz divisor instead of mhz to keep a better precision, since
- * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- * (mathieu.desnoyers@polymtl.ca)
- *
- *              -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-        unsigned long long tsc_now, ns_now, *offset;
-        unsigned long flags, *scale;
-
-        local_irq_save(flags);
-        sched_clock_idle_sleep_event();
-
-        scale = &per_cpu(cyc2ns, cpu);
-        offset = &per_cpu(cyc2ns_offset, cpu);
-
-        rdtscll(tsc_now);
-        ns_now = __cycles_2_ns(tsc_now);
-
-        if (cpu_khz) {
-                *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
-                        cpu_khz / 2) / cpu_khz;
-                *offset = ns_now - mult_frac(tsc_now, *scale,
-                        (1UL << CYC2NS_SCALE_FACTOR));
-        }
-
-        sched_clock_idle_wakeup_event(0);
-        local_irq_restore(flags);
-}
-
 static unsigned long long cyc2ns_suspend;
 
 void tsc_save_sched_clock_state(void)
 {
-        if (!sched_clock_stable)
+        if (!sched_clock_stable())
                 return;
 
         cyc2ns_suspend = sched_clock();
@@ -663,16 +852,26 @@ void tsc_restore_sched_clock_state(void)
         unsigned long flags;
         int cpu;
 
-        if (!sched_clock_stable)
+        if (!sched_clock_stable())
                 return;
 
         local_irq_save(flags);
 
-        __this_cpu_write(cyc2ns_offset, 0);
+        /*
+         * We're coming out of suspend, there's no concurrency yet; don't
+         * bother being nice about the RCU stuff, just write to both
+         * data fields.
+         */
+
+        this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+        this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
+
         offset = cyc2ns_suspend - sched_clock();
 
-        for_each_possible_cpu(cpu)
-                per_cpu(cyc2ns_offset, cpu) = offset;
+        for_each_possible_cpu(cpu) {
+                per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+                per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+        }
 
         local_irq_restore(flags);
 }
@@ -715,8 +914,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
                 tsc_khz_ref = tsc_khz;
         }
         if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-                        (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-                        (val == CPUFREQ_RESUMECHANGE)) {
+                        (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
                 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
                 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
@@ -786,16 +984,14 @@ static struct clocksource clocksource_tsc = {
         .mask                   = CLOCKSOURCE_MASK(64),
         .flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
                                   CLOCK_SOURCE_MUST_VERIFY,
-#ifdef CONFIG_X86_64
         .archdata               = { .vclock_mode = VCLOCK_TSC },
-#endif
 };
 
 void mark_tsc_unstable(char *reason)
 {
         if (!tsc_unstable) {
                 tsc_unstable = 1;
-                sched_clock_stable = 0;
+                clear_sched_clock_stable();
                 disable_sched_clock_irqtime();
                 pr_info("Marking TSC unstable due to %s\n", reason);
                 /* Change only the rating, when not registered */
@@ -995,14 +1191,18 @@ void __init tsc_init(void)
          * speed as the bootup CPU. (cpufreq notifiers will fix this
          * up if their speed diverges)
          */
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
+                cyc2ns_init(cpu);
                 set_cyc2ns_scale(cpu_khz, cpu);
+        }
 
         if (tsc_disabled > 0)
                 return;
 
         /* now allow native_sched_clock() to use rdtsc */
+
         tsc_disabled = 0;
+        static_key_slow_inc(&__use_tsc);
 
         if (!no_sched_irq_time)
                 enable_sched_clock_irqtime();
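The last hunk is where the __use_tsc static key introduced earlier gets enabled: native_sched_clock() branches on static_key_false(&__use_tsc) and stays on the cheap jiffies fallback until tsc_init() calls static_key_slow_inc(), which patches the branch site at runtime. A minimal sketch of that pattern follows, assuming kernel context; example_key, slow_fallback() and fast_read() are made-up placeholder names, not symbols from this file:

/*
 * Sketch of the static-key pattern behind __use_tsc; kernel-style code,
 * the helper names below are placeholders.
 */
#include <linux/static_key.h>
#include <linux/types.h>

static struct static_key example_key = STATIC_KEY_INIT;        /* starts disabled */

static u64 slow_fallback(void) { return 0; }    /* placeholder slow path */
static u64 fast_read(void)     { return 42; }   /* placeholder fast path */

static u64 hot_path(void)
{
        /* Compiles to a patched jump; out-of-line until the key is enabled. */
        if (!static_key_false(&example_key))
                return slow_fallback();

        return fast_read();
}

static void enable_fast_path(void)
{
        /* Done once, e.g. at init time, after the fast path is known safe. */
        static_key_slow_inc(&example_key);
}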