Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--  arch/x86/kvm/x86.c  |  548
1 file changed, 472 insertions(+), 76 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
 #include <linux/uaccess.h>
 #include <linux/hash.h>
 #include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 u64 __read_mostly host_xcr0;
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
 			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
 				return 1;
 		} else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
 static unsigned num_msrs_to_save;
 
 static const u32 emulated_msrs[] = {
+	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+	return kvm_x86_ops->set_msr(vcpu, msr);
 }
 
 /*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
  */
 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
-	return kvm_set_msr(vcpu, index, *data);
+	struct msr_data msr;
+
+	msr.data = *data;
+	msr.index = index;
+	msr.host_initiated = true;
+	return kvm_set_msr(vcpu, &msr);
 }
 
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+	seqcount_t	seq;
+
+	struct { /* extract of a clocksource struct */
+		int vclock_mode;
+		cycle_t	cycle_last;
+		cycle_t	mask;
+		u32	mult;
+		u32	shift;
+	} clock;
+
+	/* open coded 'struct timespec' */
+	u64		monotonic_time_snsec;
+	time_t		monotonic_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+	write_seqcount_begin(&vdata->seq);
+
+	/* copy pvclock gtod data */
+	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->clock->cycle_last;
+	vdata->clock.mask		= tk->clock->mask;
+	vdata->clock.mult		= tk->mult;
+	vdata->clock.shift		= tk->shift;
+
+	vdata->monotonic_time_sec	= tk->xtime_sec
+					+ tk->wall_to_monotonic.tv_sec;
+	vdata->monotonic_time_snsec	= tk->xtime_nsec
+					+ (tk->wall_to_monotonic.tv_nsec
+						<< tk->shift);
+	while (vdata->monotonic_time_snsec >=
+					(((u64)NSEC_PER_SEC) << tk->shift)) {
+		vdata->monotonic_time_snsec -=
+					((u64)NSEC_PER_SEC) << tk->shift;
+		vdata->monotonic_time_sec++;
+	}
+
+	write_seqcount_end(&vdata->seq);
+}
+#endif
+
+
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
 	int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
 	return timespec_to_ns(&ts);
 }
 
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
 unsigned long max_tsc_khz;
 
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
 	return tsc;
 }
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+	bool vcpus_matched;
+	bool do_request = false;
+	struct kvm_arch *ka = &vcpu->kvm->arch;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+			 atomic_read(&vcpu->kvm->online_vcpus));
+
+	if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
+		if (!ka->use_master_clock)
+			do_request = 1;
+
+	if (!vcpus_matched && ka->use_master_clock)
+		do_request = 1;
+
+	if (do_request)
+		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+			    atomic_read(&vcpu->kvm->online_vcpus),
+			    ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
 	s64 usdiff;
+	bool matched;
+	u64 data = msr->data;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
 			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
 		}
+		matched = true;
 	} else {
 		/*
 		 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 		kvm->arch.cur_tsc_nsec = ns;
 		kvm->arch.cur_tsc_write = data;
 		kvm->arch.cur_tsc_offset = offset;
+		matched = false;
 		pr_debug("kvm: new tsc generation %u, clock %llu\n",
 			 kvm->arch.cur_tsc_generation, data);
 	}
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
 	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
 	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
 
+	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+		update_ia32_tsc_adjust_msr(vcpu, offset);
 	kvm_x86_ops->write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+	if (matched)
+		kvm->arch.nr_vcpus_matched_tsc++;
+	else
+		kvm->arch.nr_vcpus_matched_tsc = 0;
+
+	kvm_track_tsc_matching(vcpu);
+	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
 }
 
 EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads. The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = pvclock_gtod_data.clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a function of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead. I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+	long v;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	*cycle_now = read_tsc();
+
+	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+	return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+	unsigned long seq;
+	u64 ns;
+	int mode;
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+	ts->tv_nsec = 0;
+	do {
+		seq = read_seqcount_begin(&gtod->seq);
+		mode = gtod->clock.vclock_mode;
+		ts->tv_sec = gtod->monotonic_time_sec;
+		ns = gtod->monotonic_time_snsec;
+		ns += vgettsc(cycle_now);
+		ns >>= gtod->clock.shift;
+	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+	timespec_add_ns(ts, ns);
+
+	return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+	struct timespec ts;
+
+	/* checked again under seqlock below */
+	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+		return false;
+
+	if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+		return false;
+
+	monotonic_to_bootbased(&ts);
+	*kernel_ns = timespec_to_ns(&ts);
+
+	return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ *		VCPU0 on CPU0			|	VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2.					| timespec1 = timespec0 + N
+ *					| tsc1 = tsc0 + M
+ * 3. transition to guest		| transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5.					| ret1 = timespec1 + (rdtsc - tsc1)
+ *					| ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ *	- ret0 < ret1
+ *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ *		...
+ *	- 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller than the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	struct kvm_arch *ka = &kvm->arch;
+	int vclock_mode;
+	bool host_tsc_clocksource, vcpus_matched;
+
+	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+			atomic_read(&kvm->online_vcpus));
+
+	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	host_tsc_clocksource = kvm_get_time_and_clockread(
+					&ka->master_kernel_ns,
+					&ka->master_cycle_now);
+
+	ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+
+	if (ka->use_master_clock)
+		atomic_set(&kvm_guest_has_master_clock, 1);
+
+	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+					vcpus_matched);
+#endif
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
-	unsigned long flags;
+	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct kvm_arch *ka = &v->kvm->arch;
 	void *shared_kaddr;
-	unsigned long this_tsc_khz;
 	s64 kernel_ns, max_kernel_ns;
-	u64 tsc_timestamp;
+	u64 tsc_timestamp, host_tsc;
+	struct pvclock_vcpu_time_info *guest_hv_clock;
 	u8 pvclock_flags;
+	bool use_master_clock;
+
+	kernel_ns = 0;
+	host_tsc = 0;
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
-	kernel_ns = get_kernel_ns();
 	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	if (unlikely(this_tsc_khz == 0)) {
 		local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	}
 
 	/*
+	 * If the host uses TSC clock, then passthrough TSC as stable
+	 * to the guest.
+	 */
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	use_master_clock = ka->use_master_clock;
+	if (use_master_clock) {
+		host_tsc = ka->master_cycle_now;
+		kernel_ns = ka->master_kernel_ns;
+	}
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+	if (!use_master_clock) {
+		host_tsc = native_read_tsc();
+		kernel_ns = get_kernel_ns();
+	}
+
+	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+	/*
 	 * We may have to catch up the TSC to match elapsed wall clock
 	 * time for two reasons, even if kvmclock is used.
 	 * 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	if (max_kernel_ns > kernel_ns)
-		kernel_ns = max_kernel_ns;
-
+	/* with a master <monotonic time, tsc value> tuple,
+	 * pvclock clock reads always increase at the (scaled) rate
+	 * of guest TSC - no need to deal with sampling errors.
+	 */
+	if (!use_master_clock) {
+		if (max_kernel_ns > kernel_ns)
+			kernel_ns = max_kernel_ns;
+	}
 	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_kernel_ns = kernel_ns;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	pvclock_flags = 0;
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
-	vcpu->hv_clock.flags = pvclock_flags;
-
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	shared_kaddr = kmap_atomic(vcpu->time_page);
 
+	guest_hv_clock = shared_kaddr + vcpu->time_offset;
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	/* If the host uses TSC clocksource, then it is stable */
+	if (use_master_clock)
+		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+	vcpu->hv_clock.flags = pvclock_flags;
+
 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
 	       sizeof(vcpu->hv_clock));
 
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
 }
 
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	bool pr = false;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
 
 	switch (msr) {
 	case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_TSCDEADLINE:
 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
 		break;
+	case MSR_IA32_TSC_ADJUST:
+		if (guest_cpuid_has_tsc_adjust(vcpu)) {
+			if (!msr_info->host_initiated) {
+				u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+				kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+			}
+			vcpu->arch.ia32_tsc_adjust_msr = data;
+		}
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_TSCDEADLINE:
 		data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
+	case MSR_IA32_TSC_ADJUST:
+		data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+		break;
 	case MSR_IA32_MISC_ENABLE:
 		data = vcpu->arch.ia32_misc_enable_msr;
 		break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		kvm_x86_ops->write_tsc_offset(vcpu, offset);
 		vcpu->arch.tsc_catchup = 1;
 	}
-	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+	/*
+	 * On a host with synchronized TSC, there is no need to update
+	 * kvmclock on vcpu->cpu migration
+	 */
+	if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
 	if (vcpu->cpu != cpu)
 		kvm_migrate_timers(vcpu);
 	vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (!vcpu->arch.apic)
 			goto out;
 		u.lapic = memdup_user(argp, sizeof(*u.lapic));
-		if (IS_ERR(u.lapic)) {
-			r = PTR_ERR(u.lapic);
-			goto out;
-		}
+		if (IS_ERR(u.lapic))
+			return PTR_ERR(u.lapic);
 
 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&irq, argp, sizeof irq))
 			goto out;
 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_NMI: {
 		r = kvm_vcpu_ioctl_nmi(vcpu);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
 			goto out;
 		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			goto out;
 		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
 					      cpuid_arg->entries);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XSAVE: {
 		u.xsave = memdup_user(argp, sizeof(*u.xsave));
-		if (IS_ERR(u.xsave)) {
-			r = PTR_ERR(u.xsave);
-			goto out;
-		}
+		if (IS_ERR(u.xsave))
+			return PTR_ERR(u.xsave);
 
 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 	case KVM_SET_XCRS: {
 		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
-		if (IS_ERR(u.xcrs)) {
-			r = PTR_ERR(u.xcrs);
-			goto out;
-		}
+		if (IS_ERR(u.xcrs))
+			return PTR_ERR(u.xcrs);
 
 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
 	int ret;
 
 	if (addr > (unsigned int)(-3 * PAGE_SIZE))
-		return -1;
+		return -EINVAL;
 	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
 	return ret;
 }
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	switch (ioctl) {
 	case KVM_SET_TSS_ADDR:
 		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
-		if (r < 0)
-			goto out;
 		break;
 	case KVM_SET_IDENTITY_MAP_ADDR: {
 		u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
 			goto out;
 		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
-		if (r < 0)
-			goto out;
 		break;
 	}
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
-		if (r)
-			goto out;
 		break;
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	get_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 	set_irqchip_out:
 		kfree(chip);
-		if (r)
-			goto out;
 		break;
 	}
 	case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (!kvm->arch.vpit)
 			goto out;
 		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		if (copy_from_user(&control, argp, sizeof(control)))
 			goto out;
 		r = kvm_vm_ioctl_reinject(kvm, &control);
-		if (r)
-			goto out;
-		r = 0;
 		break;
 	}
 	case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 data)
 {
-	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+	struct msr_data msr;
+
+	msr.data = data;
+	msr.index = msr_index;
+	msr.host_initiated = false;
+	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	 * instruction -> ...
 	 */
 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-	if (!is_error_pfn(pfn)) {
+	if (!is_error_noslot_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return true;
 	}
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(mask);
 }
 
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+	struct kvm *kvm;
+
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	raw_spin_lock(&kvm_lock);
+	list_for_each_entry(kvm, &vm_list, vm_list)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+	atomic_set(&kvm_guest_has_master_clock, 0);
+	raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+			       void *priv)
+{
+	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+	struct timekeeper *tk = priv;
+
+	update_pvclock_gtod(tk);
+
+	/* disable master clock if host does not trust, or does not
+	 * use, TSC clocksource
+	 */
+	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+	    atomic_read(&kvm_guest_has_master_clock) != 0)
+		queue_work(system_long_wq, &pvclock_gtod_work);
+
+	return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+	.notifier_call = pvclock_gtod_notify,
+};
+#endif
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
 	host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+#ifdef CONFIG_X86_64
+	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
 	return 0;
 
 out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
 	cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 				    CPUFREQ_TRANSITION_NOTIFIER);
 	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
 	kvm_x86_ops = NULL;
 	kvm_mmu_module_exit();
 }
@@ -5059,7 +5396,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
 
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
 	char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+	int i;
+	struct kvm_vcpu *vcpu;
+	struct kvm_arch *ka = &kvm->arch;
+
+	spin_lock(&ka->pvclock_gtod_sync_lock);
+	kvm_make_mclock_inprogress_request(kvm);
+	/* no guest entries from this point */
+	pvclock_update_vm_gtod_copy(kvm);
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+	/* guest entries allowed */
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+	spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
+		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+			kvm_gen_update_masterclock(vcpu->kvm);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+							   native_read_tsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			pr_debug("vcpu %d received sipi with vector # %x\n",
 				 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 			kvm_lapic_reset(vcpu);
-			r = kvm_arch_vcpu_reset(vcpu);
+			r = kvm_vcpu_reset(vcpu);
 			if (r)
 				return r;
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_arch_vcpu_reset(vcpu);
+	r = kvm_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	int r;
+	struct msr_data msr;
+
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
+	msr.data = 0x0;
+	msr.index = MSR_IA32_TSC;
+	msr.host_initiated = true;
+	kvm_write_tsc(vcpu, &msr);
+	vcpu_put(vcpu);
+
+	return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_pmu_reset(vcpu);
 
+	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	r = fx_init(vcpu);
+	if (r)
+		goto fail_free_wbinvd_dirty_mask;
+
+	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
+fail_free_wbinvd_dirty_mask:
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }