12 files changed, 377 insertions, 199 deletions
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 2f366d0ac6b4..1da25a5f96f9 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_MWAIT_H
 #define _ASM_X86_MWAIT_H
+#include <linux/sched.h>
 #define MWAIT_SUBSTATE_MASK             0xf
 #define MWAIT_CSTATE_MASK               0xf
 #define MWAIT_SUBSTATE_SIZE             4
@@ -13,4 +15,45 @@
 #define MWAIT_ECX_INTERRUPT_BREAK       0x1
+static inline void __monitor(const void *eax, unsigned long ecx,
+                             unsigned long edx)
+{
+        /*
+         * "monitor %eax, %ecx, %edx;"
+         *
+         * Emitted as raw opcode bytes instead of a mnemonic. @eax, @ecx and
+         * @edx are bound to the EAX, ECX and EDX registers by the input
+         * constraints below; the callers visible in this patch pass the
+         * address to watch in @eax and 0 for the other two.
+         */
+        asm volatile(".byte 0x0f, 0x01, 0xc8;"
+                     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+static inline void __mwait(unsigned long eax, unsigned long ecx)
+{
+        /*
+         * "mwait %eax, %ecx;"
+         *
+         * Emitted as raw opcode bytes instead of a mnemonic; @eax and @ecx
+         * are bound to the EAX and ECX registers by the input constraints
+         * below.
+         */
+        asm volatile(".byte 0x0f, 0x01, 0xc9;"
+                     :: "a" (eax), "c" (ecx));
+}
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ *
+ * @eax and @ecx are handed to __mwait() unchanged.
+ *
+ * The address that is actually monitored is current_thread_info()->flags.
+ * The sequence is:
+ *  - current_set_polling_and_test() gates the whole wait (defined
+ *    elsewhere; by its name it marks the task as polling and reports an
+ *    already pending reschedule -- confirm against its definition);
+ *  - on CPUs with X86_FEATURE_CLFLUSH_MONITOR the monitored cache line is
+ *    flushed, fenced by mb() on both sides, before MONITOR is armed;
+ *  - need_resched() is checked once more after arming, presumably so that
+ *    a request which raced with MONITOR is not slept through;
+ *  - current_clr_polling() runs on every path, whether or not we waited.
+ */
+static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+{
+        if (!current_set_polling_and_test()) {
+                if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+                        mb();
+                        clflush((void *)&current_thread_info()->flags);
+                        mb();
+                }
+                __monitor((void *)&current_thread_info()->flags, 0, 0);
+                if (!need_resched())
+                        __mwait(eax, ecx);
+        }
+        current_clr_polling();
+}
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7b034a4057f9..24821f5768bc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -700,29 +700,6 @@ static inline void sync_core(void)
 #endif
 }
-static inline void __monitor(const void *eax, unsigned long ecx,
-                             unsigned long edx)
-{
-        /* "monitor %eax, %ecx, %edx;" */
-        asm volatile(".byte 0x0f, 0x01, 0xc8;"
-                     :: "a" (eax), "c" (ecx), "d"(edx));
-}
-static inline void __mwait(unsigned long eax, unsigned long ecx)
-{
-        /* "mwait %eax, %ecx;" */
-        asm volatile(".byte 0x0f, 0x01, 0xc9;"
-                     :: "a" (eax), "c" (ecx));
-}
-static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
-{
-        trace_hardirqs_on();
-        /* "mwait %eax, %ecx;" */
-        asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
-                     :: "a" (eax), "c" (ecx));
-}
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 34baa0eb5d0c..3de54ef0aea5 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -4,6 +4,7 @@
 #include <linux/pm.h>
 #include <linux/percpu.h>
 #include <linux/interrupt.h>
+#include <linux/math64.h>
 #define TICK_SIZE (tick_nsec / 1000)
@@ -12,68 +13,26 @@ extern int recalibrate_cpu_khz(void);
 extern int no_timer_check;
-/* Accelerators for sched_clock()
+/*
- * convert from cycles(64bits) => nanoseconds (64bits)
+ * We use the full linear equation: f(x) = a + b*x, in order to allow
- *  basic equation:
+ * a continuous function in the face of dynamic freq changes.
- *              ns = cycles / (freq / ns_per_sec)
- *              ns = cycles * (ns_per_sec / freq)
- *              ns = cycles * (10^9 / (cpu_khz * 10^3))
- *              ns = cycles * (10^6 / cpu_khz)
 *
- *      Then we use scaling math (suggested by george@mvista.com) to get:
+ * Continuity means that when our frequency changes our slope (b); we want to
- *              ns = cycles * (10^6 * SC / cpu_khz) / SC
+ * ensure that: f(t) == f'(t), which gives: a + b*t == a' + b'*t.
- *              ns = cycles * cyc2ns_scale / SC
 *
- *      And since SC is a constant power of two, we can convert the div
+ * Without an offset (a) the above would not be possible.
- *  into a shift.
 *
- *  We can use khz divisor instead of mhz to keep a better precision, since
+ * See the comment near cycles_2_ns() for details on how we compute (b).
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
- *
- * In:
- *
- * ns = cycles * cyc2ns_scale / SC
- *
- * Although we may still have enough bits to store the value of ns,
- * in some cases, we may not have enough bits to store cycles * cyc2ns_scale,
- * leading to an incorrect result.
- *
- * To avoid this, we can decompose 'cycles' into quotient and remainder
- * of division by SC.  Then,
- *
- * ns = (quot * SC + rem) * cyc2ns_scale / SC
- *    = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC
- *
- *                      - sqazi@google.com
 */
+struct cyc2ns_data {
-DECLARE_PER_CPU(unsigned long, cyc2ns);
+        u32 cyc2ns_mul;
-DECLARE_PER_CPU(unsigned long long, cyc2ns_offset);
+        u32 cyc2ns_shift;
+        u64 cyc2ns_offset;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+        u32 __count;
+        /* u32 hole */
-static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
+}; /* 24 bytes -- do not grow */
-{
-        int cpu = smp_processor_id();
+extern struct cyc2ns_data *cyc2ns_read_begin(void);
-        unsigned long long ns = per_cpu(cyc2ns_offset, cpu);
+extern void cyc2ns_read_end(struct cyc2ns_data *);
-        ns += mult_frac(cyc, per_cpu(cyc2ns, cpu),
-                        (1UL << CYC2NS_SCALE_FACTOR));
-        return ns;
-}
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-        unsigned long long ns;
-        unsigned long flags;
-        local_irq_save(flags);
-        ns = __cycles_2_ns(cyc);
-        local_irq_restore(flags);
-        return ns;
-}
 #endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-        if (!need_resched()) {
-                if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-                        clflush((void *)&current_thread_info()->flags);
-                __monitor((void *)&current_thread_info()->flags, 0, 0);
-                smp_mb();
-                if (!need_resched())
-                        __mwait(ax, cx);
-        }
-}
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
        unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..8bc79cddd9a2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -487,7 +487,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
                if (!check_tsc_unstable())
-                        sched_clock_stable = 1;
+                        set_sched_clock_stable();
        }
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index ea04b342c026..1a439c047ff3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -93,7 +93,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
                set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
                if (!check_tsc_unstable())
-                        sched_clock_stable = 1;
+                        set_sched_clock_stable();
        }
        /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
 void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+        struct cyc2ns_data *data;
        userpg->cap_user_time = 0;
        userpg->cap_user_time_zero = 0;
        userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
        userpg->pmc_width = x86_pmu.cntval_bits;
-        if (!sched_clock_stable)
+        if (!sched_clock_stable())
                return;
+        data = cyc2ns_read_begin();
        userpg->cap_user_time = 1;
-        userpg->time_mult = this_cpu_read(cyc2ns);
+        userpg->time_mult = data->cyc2ns_mul;
-        userpg->time_shift = CYC2NS_SCALE_FACTOR;
+        userpg->time_shift = data->cyc2ns_shift;
-        userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
+        userpg->time_offset = data->cyc2ns_offset - now;
        userpg->cap_user_time_zero = 1;
-        userpg->time_zero = this_cpu_read(cyc2ns_offset);
+        userpg->time_zero = data->cyc2ns_offset;
+        cyc2ns_read_end(data);
 }
 /*
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..f5252c4eec8c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1417,7 +1417,9 @@ static inline void mwait_play_dead(void)
                 * The WBINVD is insufficient due to the spurious-wakeup
                 * case where we return around the loop.
                 */
+                mb();
                clflush(mwait_ptr);
+                mb();
                __monitor(mwait_ptr, 0, 0);
                mb();
                __mwait(eax, 0);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..6377fb28b958 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
 #include <linux/timex.h>
+#include <linux/static_key.h>
 #include <asm/hpet.h>
 #include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
   erroneous rdtsc usage on !cpu_has_tsc processors */
 static int __read_mostly tsc_disabled = -1;
+static struct static_key __use_tsc = STATIC_KEY_INIT;
 int tsc_clocksource_reliable;
+/*
+ * Use a ring-buffer like data structure, where a writer advances the head by
+ * writing a new data entry and a reader advances the tail when it observes a
+ * new entry.
+ *
+ * Writers are made to wait on readers until there's space to write a new
+ * entry.
+ *
+ * This means that we can always use an {offset, mul} pair to compute a ns
+ * value that is 'roughly' in the right direction, even if we're writing a new
+ * {offset, mul} pair during the clock read.
+ *
+ * The down-side is that we can no longer guarantee strict monotonicity anymore
+ * (assuming the TSC was that to begin with), because while we compute the
+ * intersection point of the two clock slopes and make sure the time is
+ * continuous at the point of switching; we can no longer guarantee a reader is
+ * strictly before or after the switch point.
+ *
+ * It does mean a reader no longer needs to disable IRQs in order to avoid
+ * CPU-Freq updates messing with his times, and similarly an NMI reader will
+ * no longer run the risk of hitting half-written state.
+ */
+struct cyc2ns {
+        struct cyc2ns_data data[2];     /*  0 + 2*24 = 48 */
+        struct cyc2ns_data *head;       /* 48 + 8    = 56 */
+        struct cyc2ns_data *tail;       /* 56 + 8    = 64 */
+}; /* exactly fits one cacheline */
+static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
+struct cyc2ns_data *cyc2ns_read_begin(void)
+{
+        struct cyc2ns_data *head;
+        preempt_disable();
+        head = this_cpu_read(cyc2ns.head);
+        /*
+         * Ensure we observe the entry when we observe the pointer to it.
+         * matches the wmb from cyc2ns_write_end().
+         *
+         * The __count increment below marks this entry as in use by a
+         * (possibly nested) reader on this CPU. The caller must hand the
+         * returned pointer back to cyc2ns_read_end(), which drops the
+         * count again and re-enables the preemption disabled above.
+         */
+        smp_read_barrier_depends();
+        head->__count++;
+        barrier();
+        return head;
+}
+void cyc2ns_read_end(struct cyc2ns_data *head)
+{
+        barrier();
+        /*
+         * @head is the pointer returned by the matching
+         * cyc2ns_read_begin(); this drops the __count reference taken
+         * there and undoes its preempt_disable().
+         *
+         * If we're the outer most nested read; update the tail pointer
+         * when we're done. This notifies possible pending writers
+         * that we've observed the head pointer and that the other
+         * entry is now free.
+         */
+        if (!--head->__count) {
+                /*
+                 * x86-TSO does not reorder writes with older reads;
+                 * therefore once this write becomes visible to another
+                 * cpu, we must be finished reading the cyc2ns_data.
+                 *
+                 * matches with cyc2ns_write_begin().
+                 */
+                this_cpu_write(cyc2ns.tail, head);
+        }
+        preempt_enable();
+}
+/*
+ * Begin writing a new @data entry for @cpu.
+ *
+ * Picks whichever of the two data[] slots is not the currently published
+ * head, then spins until the reader side's tail pointer no longer refers
+ * to that slot. Returns the slot for the caller to fill in; it becomes
+ * visible to readers only once passed to cyc2ns_write_end().
+ *
+ * Assumes some sort of write side lock; currently 'provided' by the assumption
+ * that cpufreq will call its notifiers sequentially.
+ */
+static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+        struct cyc2ns_data *data = c2n->data;
+        if (data == c2n->head)
+                data++;
+        /* XXX send an IPI to @cpu in order to guarantee a read? */
+        /*
+         * When we observe the tail write from cyc2ns_read_end(),
+         * the cpu must be done with that entry and it's safe
+         * to start writing to it.
+         */
+        while (c2n->tail == data)
+                cpu_relax();
+        return data;
+}
+static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+        /*
+         * Ensure the @data writes are visible before we publish the
+         * entry. Matches the data-dependency in cyc2ns_read_begin().
+         *
+         * Publishing is the single store of @data into @cpu's head
+         * pointer below; @data is expected to be the slot obtained from
+         * cyc2ns_write_begin() for the same @cpu.
+         */
+        smp_wmb();
+        ACCESS_ONCE(c2n->head) = data;
+}
+/*
+ * Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *              ns = cycles / (freq / ns_per_sec)
+ *              ns = cycles * (ns_per_sec / freq)
+ *              ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *              ns = cycles * (10^6 / cpu_khz)
+ *
+ *      Then we use scaling math (suggested by george@mvista.com) to get:
+ *              ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *              ns = cycles * cyc2ns_scale / SC
+ *
+ *      And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better precision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+static void cyc2ns_data_init(struct cyc2ns_data *data)
+{
+        data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
+        data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+        data->cyc2ns_offset = 0;
+        data->__count = 0;
+}
+static void cyc2ns_init(int cpu)
+{
+        struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
+        cyc2ns_data_init(&c2n->data[0]);
+        cyc2ns_data_init(&c2n->data[1]);
+        c2n->head = c2n->data;
+        c2n->tail = c2n->data;
+}
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+        struct cyc2ns_data *data, *tail;
+        unsigned long long ns;
+        /*
+         * Convert the absolute TSC value @cyc to nanoseconds using this
+         * CPU's currently published {offset, mul} pair.
+         *
+         * See cyc2ns_read_*() for details; replicated in order to avoid
+         * an extra few instructions that came with the abstraction.
+         * Notably, it allows us to only do the __count and tail update
+         * dance when it's actually needed.
+         *
+         * The _notrace preempt variants are used on purpose: this runs
+         * underneath sched_clock(), which the function and function-graph
+         * tracers use as a timestamp source. When preempt_disable() is an
+         * out-of-line, traceable call (preempt debugging/tracing enabled),
+         * using it here would re-enter the tracer from inside its own clock
+         * read and recurse.
+         */
+        preempt_disable_notrace();
+        data = this_cpu_read(cyc2ns.head);
+        tail = this_cpu_read(cyc2ns.tail);
+        if (likely(data == tail)) {
+                /* No update in flight: head == tail, read without pinning. */
+                ns = data->cyc2ns_offset;
+                ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+        } else {
+                /*
+                 * A new entry was published and tail has not caught up yet:
+                 * pin the entry via __count while reading and, as the
+                 * outermost reader, advance tail to release the old slot.
+                 */
+                data->__count++;
+                barrier();
+                ns = data->cyc2ns_offset;
+                ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+                barrier();
+                if (!--data->__count)
+                        this_cpu_write(cyc2ns.tail, data);
+        }
+        preempt_enable_notrace();
+        return ns;
+}
+/*
+ * XXX surely we already have this someplace in the kernel?!
+ * NOTE(review): DIV_ROUND_CLOSEST() from <linux/kernel.h> looks equivalent
+ * for the unsigned operands used here -- confirm before switching.
+ */
+#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
+static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
+{
+        unsigned long long tsc_now, ns_now;
+        struct cyc2ns_data *data;
+        unsigned long flags;
+        local_irq_save(flags);
+        sched_clock_idle_sleep_event();
+        if (!cpu_khz)
+                goto done;
+        data = cyc2ns_write_begin(cpu);
+        rdtscll(tsc_now);
+        ns_now = cycles_2_ns(tsc_now);
+        /*
+         * Compute a new multiplier as per the above comment and ensure our
+         * time function is continuous; see the comment near struct
+         * cyc2ns_data.
+         *
+         * ns_now was computed from tsc_now with the parameters that are
+         * still published, so choosing
+         *      offset = ns_now - ((tsc_now * mul) >> shift)
+         * makes the new entry yield ns_now at tsc_now as well.
+         *
+         * A zero @cpu_khz skips the update entirely (see the goto above)
+         * and leaves the published entry untouched; the idle sleep/wakeup
+         * events and the IRQ restore still run in that case.
+         */
+        data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
+        data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
+        data->cyc2ns_offset = ns_now -
+                mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
+        cyc2ns_write_end(cpu, data);
+done:
+        sched_clock_idle_wakeup_event(0);
+        local_irq_restore(flags);
+}
 /*
 * Scheduler clock - returns current time in nanosec units.
 */
 u64 native_sched_clock(void)
 {
-        u64 this_offset;
+        u64 tsc_now;
        /*
         * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
         *   very important for it to be as fast as the platform
         *   can achieve it. )
         */
-        if (unlikely(tsc_disabled)) {
+        if (!static_key_false(&__use_tsc)) {
                /* No locking but a rare wrong value is not a big deal: */
                return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
        }
        /* read the Time Stamp Counter: */
-        rdtscll(this_offset);
+        rdtscll(tsc_now);
        /* return the value in ns */
-        return __cycles_2_ns(this_offset);
+        return cycles_2_ns(tsc_now);
 }
 /* We need to define a real function for sched_clock, to override the
@@ -589,61 +821,11 @@ int recalibrate_cpu_khz(void)
 EXPORT_SYMBOL(recalibrate_cpu_khz);
-/* Accelerators for sched_clock()
- * convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *              ns = cycles / (freq / ns_per_sec)
- *              ns = cycles * (ns_per_sec / freq)
- *              ns = cycles * (10^9 / (cpu_khz * 10^3))
- *              ns = cycles * (10^6 / cpu_khz)
- *
- *      Then we use scaling math (suggested by george@mvista.com) to get:
- *              ns = cycles * (10^6 * SC / cpu_khz) / SC
- *              ns = cycles * cyc2ns_scale / SC
- *
- *      And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *
- *  We can use khz divisor instead of mhz to keep a better precision, since
- *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
- *  (mathieu.desnoyers@polymtl.ca)
- *
- *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-DEFINE_PER_CPU(unsigned long, cyc2ns);
-DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
-static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
-{
-        unsigned long long tsc_now, ns_now, *offset;
-        unsigned long flags, *scale;
-        local_irq_save(flags);
-        sched_clock_idle_sleep_event();
-        scale = &per_cpu(cyc2ns, cpu);
-        offset = &per_cpu(cyc2ns_offset, cpu);
-        rdtscll(tsc_now);
-        ns_now = __cycles_2_ns(tsc_now);
-        if (cpu_khz) {
-                *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
-                                cpu_khz / 2) / cpu_khz;
-                *offset = ns_now - mult_frac(tsc_now, *scale,
-                                             (1UL << CYC2NS_SCALE_FACTOR));
-        }
-        sched_clock_idle_wakeup_event(0);
-        local_irq_restore(flags);
-}
 static unsigned long long cyc2ns_suspend;
 void tsc_save_sched_clock_state(void)
 {
-        if (!sched_clock_stable)
+        if (!sched_clock_stable())
                return;
        cyc2ns_suspend = sched_clock();
@@ -663,16 +845,26 @@ void tsc_restore_sched_clock_state(void)
        unsigned long flags;
        int cpu;
-        if (!sched_clock_stable)
+        if (!sched_clock_stable())
                return;
        local_irq_save(flags);
-        __this_cpu_write(cyc2ns_offset, 0);
+        /*
+         * We're coming out of suspend, there's no concurrency yet; don't
+         * bother being nice about the RCU stuff, just write to both
+         * data fields.
+         */
+        this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
+        this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
        offset = cyc2ns_suspend - sched_clock();
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
-                per_cpu(cyc2ns_offset, cpu) = offset;
+                per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
+                per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
+        }
        local_irq_restore(flags);
 }
@@ -795,7 +987,7 @@ void mark_tsc_unstable(char *reason)
 {
        if (!tsc_unstable) {
                tsc_unstable = 1;
-                sched_clock_stable = 0;
+                clear_sched_clock_stable();
                disable_sched_clock_irqtime();
                pr_info("Marking TSC unstable due to %s\n", reason);
                /* Change only the rating, when not registered */
@@ -995,14 +1187,18 @@ void __init tsc_init(void)
         * speed as the bootup CPU. (cpufreq notifiers will fix this
         * up if their speed diverges)
         */
-        for_each_possible_cpu(cpu)
+        for_each_possible_cpu(cpu) {
+                cyc2ns_init(cpu);
                set_cyc2ns_scale(cpu_khz, cpu);
+        }
        if (tsc_disabled > 0)
                return;
        /* now allow native_sched_clock() to use rdtsc */
        tsc_disabled = 0;
+        static_key_slow_inc(&__use_tsc);
        if (!no_sched_irq_time)
                enable_sched_clock_irqtime();
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index efe4d7220397..dfe605ac1bcd 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -433,15 +433,49 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
        return;
 }
-static inline unsigned long cycles_2_us(unsigned long long cyc)
+/*
+ * Not to be confused with cycles_2_ns() from tsc.c; this gives a relative
+ * number, not an absolute. It converts a duration in cycles to a duration in
+ * ns.
+ */
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
+        struct cyc2ns_data *data = cyc2ns_read_begin();
        unsigned long long ns;
-        unsigned long us;
-        int cpu = smp_processor_id();
-        ns =  (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
+        ns = mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);
-        us = ns / 1000;
-        return us;
+        cyc2ns_read_end(data);
+        return ns;
+}
+/*
+ * The reverse of the above; converts a duration in ns to a duration in cycles.
+ *
+ * Computes (ns << cyc2ns_shift) / cyc2ns_mul on the entry obtained from
+ * cyc2ns_read_begin(), releasing it with cyc2ns_read_end() before
+ * returning. The integer division truncates.
+ * NOTE(review): the left shift can overflow 64 bits for very large @ns --
+ * confirm that callers stay in range.
+ */
+static inline unsigned long long ns_2_cycles(unsigned long long ns)
+{
+        struct cyc2ns_data *data = cyc2ns_read_begin();
+        unsigned long long cyc;
+        cyc = (ns << data->cyc2ns_shift) / data->cyc2ns_mul;
+        cyc2ns_read_end(data);
+        return cyc;
+}
+static inline unsigned long cycles_2_us(unsigned long long cyc)
+{
+        return cycles_2_ns(cyc) / NSEC_PER_USEC;
+}
+static inline cycles_t sec_2_cycles(unsigned long sec)
+{
+        return ns_2_cycles(sec * NSEC_PER_SEC);
+}
+static inline unsigned long long usec_2_cycles(unsigned long usec)
+{
+        return ns_2_cycles(usec * NSEC_PER_USEC);
 }
 /*
@@ -668,16 +702,6 @@ static int wait_completion(struct bau_desc *bau_desc,
                                                                bcp, try);
 }
-static inline cycles_t sec_2_cycles(unsigned long sec)
-{
-        unsigned long ns;
-        cycles_t cyc;
-        ns = sec * 1000000000;
-        cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-        return cyc;
-}
 /*
 * Our retries are blocked by all destination sw ack resources being
 * in use, and a timeout is pending. In that case hardware immediately
@@ -1327,16 +1351,6 @@ static void ptc_seq_stop(struct seq_file *file, void *data)
 {
 }
-static inline unsigned long long usec_2_cycles(unsigned long microsec)
-{
-        unsigned long ns;
-        unsigned long long cyc;
-        ns = microsec * 1000;
-        cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
-        return cyc;
-}
 /*
 * Display the statistics thru /proc/sgi_uv/ptc_statistics
 * 'data' points to the cpu number
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb8380a1c..96bc506ac6de 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,5 @@
 348     i386    process_vm_writev       sys_process_vm_writev           compat_sys_process_vm_writev
 349     i386    kcmp                    sys_kcmp
 350     i386    finit_module            sys_finit_module
+351     i386    sched_setattr           sys_sched_setattr
+352     i386    sched_getattr           sys_sched_getattr
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65dfd14f..a12bddc7ccea 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -320,6 +320,8 @@
 311     64      process_vm_writev       sys_process_vm_writev
 312     common  kcmp                    sys_kcmp
 313     common  finit_module            sys_finit_module
+314     common  sched_setattr           sys_sched_setattr
+315     common  sched_getattr           sys_sched_getattr
 #
 # x32-specific system call numbers start at 512 to avoid cache impact