path: root/arch/x86
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-12-13 18:31:08 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-12-13 18:31:08 -0500
commit    66cdd0ceaf65a18996f561b770eedde1d123b019 (patch)
tree      4892eaa422d366fce5d1e866ff1fe0988af95569 /arch/x86
parent    896ea17d3da5f44b2625c9cda9874d7dfe447393 (diff)
parent    58b7825bc324da55415034a9f6ca5d716b8fd898 (diff)
Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti:
 "Considerable KVM/PPC work, x86 kvmclock vsyscall support,
  IA32_TSC_ADJUST MSR emulation, amongst others."

Fix up trivial conflict in kernel/sched/core.c due to cross-cpu
migration notifier added next to rq migration call-back.

* tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits)
  KVM: emulator: fix real mode segment checks in address linearization
  VMX: remove unneeded enable_unrestricted_guest check
  KVM: VMX: fix DPL during entry to protected mode
  x86/kexec: crash_vmclear_local_vmcss needs __rcu
  kvm: Fix irqfd resampler list walk
  KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump
  x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary
  KVM: MMU: optimize for set_spte
  KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface
  KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation
  KVM: PPC: bookehv: Add guest computation mode for irq delivery
  KVM: PPC: Make EPCR a valid field for booke64 and bookehv
  KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit
  KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation
  KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation
  KVM: PPC: e500: Add emulation helper for getting instruction ea
  KVM: PPC: bookehv64: Add support for interrupt handling
  KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler
  KVM: PPC: booke: Fix get_tb() compile error on 64-bit
  KVM: PPC: e500: Silence bogus GCC warning in tlb code
  ...
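For readers skimming the kvmclock vsyscall work in the diff below: the guest consumes a per-vCPU time record that the hypervisor republishes under an even/odd version counter (see __pvclock_read_cycles in the pvclock.h hunk). The following is only a simplified user-space sketch of that read protocol, not code from this merge; pv_time, pv_read_ns, scale_delta and the lfence-ordered rdtsc() are illustrative stand-ins for the kernel's pvclock_vcpu_time_info, __pvclock_read_cycles, pvclock_scale_delta and rdtsc_barrier()/__native_read_tsc().

#include <stdint.h>

struct pv_time {                        /* simplified pvclock_vcpu_time_info */
	volatile uint32_t version;      /* odd while the host is updating    */
	uint64_t tsc_timestamp;         /* TSC value at last host update     */
	uint64_t system_time;           /* nanoseconds at tsc_timestamp      */
	uint32_t tsc_to_system_mul;     /* fixed-point TSC->ns multiplier    */
	int8_t   tsc_shift;
	uint8_t  flags;
};

static inline uint64_t rdtsc_ordered(void)
{
	uint32_t lo, hi;
	/* lfence keeps the TSC read inside the version-checked window */
	__asm__ __volatile__("lfence; rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static uint64_t scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
{
	/* same fixed-point scaling idea as pvclock_scale_delta */
	if (shift < 0)
		delta >>= -shift;
	else
		delta <<= shift;
	return (uint64_t)(((unsigned __int128)delta * mul) >> 32);
}

uint64_t pv_read_ns(const struct pv_time *t)
{
	uint32_t version;
	uint64_t ns;

	do {
		version = t->version;                   /* snapshot the counter */
		__asm__ __volatile__("" ::: "memory");  /* compiler barrier     */
		ns = t->system_time +
		     scale_delta(rdtsc_ordered() - t->tsc_timestamp,
				 t->tsc_to_system_mul, t->tsc_shift);
		__asm__ __volatile__("" ::: "memory");
	} while ((version & 1) || version != t->version); /* retry on torn read */

	return ns;
}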
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/clocksource.h |   1
-rw-r--r--  arch/x86/include/asm/cpufeature.h  |   1
-rw-r--r--  arch/x86/include/asm/fixmap.h      |   5
-rw-r--r--  arch/x86/include/asm/kexec.h       |   3
-rw-r--r--  arch/x86/include/asm/kvm_guest.h   |   6
-rw-r--r--  arch/x86/include/asm/kvm_host.h    |  24
-rw-r--r--  arch/x86/include/asm/msr-index.h   |   1
-rw-r--r--  arch/x86/include/asm/pvclock.h     |  47
-rw-r--r--  arch/x86/include/asm/vmx.h         |   3
-rw-r--r--  arch/x86/include/asm/vsyscall.h    |  20
-rw-r--r--  arch/x86/kernel/crash.c            |  32
-rw-r--r--  arch/x86/kernel/kvm.c              |  20
-rw-r--r--  arch/x86/kernel/kvmclock.c         |  88
-rw-r--r--  arch/x86/kernel/pvclock.c          | 143
-rw-r--r--  arch/x86/kvm/cpuid.c               |   3
-rw-r--r--  arch/x86/kvm/cpuid.h               |   8
-rw-r--r--  arch/x86/kvm/emulate.c             |   5
-rw-r--r--  arch/x86/kvm/lapic.c               |   2
-rw-r--r--  arch/x86/kvm/mmu.c                 |  65
-rw-r--r--  arch/x86/kvm/paging_tmpl.h         | 115
-rw-r--r--  arch/x86/kvm/svm.c                 |  48
-rw-r--r--  arch/x86/kvm/trace.h               |  63
-rw-r--r--  arch/x86/kvm/vmx.c                 | 203
-rw-r--r--  arch/x86/kvm/x86.c                 | 548
-rw-r--r--  arch/x86/kvm/x86.h                 |   2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c     |  81
-rw-r--r--  arch/x86/vdso/vgetcpu.c            |  11
27 files changed, 1209 insertions, 339 deletions
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
 #define VCLOCK_NONE 0  /* No vDSO clock available.	*/
 #define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
 #define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
 
 struct arch_clocksource_data {
 	int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
 
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
 #define X86_FEATURE_FSGSBASE	(9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST	(9*32+ 1) /* TSC adjustment MSR 0x3b */
 #define X86_FEATURE_BMI1	(9*32+ 3) /* 1st group bit manipulation extensions */
 #define X86_FEATURE_HLE		(9*32+ 4) /* Hardware Lock Elision */
 #define X86_FEATURE_AVX2	(9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
 #include <asm/acpi.h>
 #include <asm/apicdef.h>
 #include <asm/page.h>
+#include <asm/pvclock.h>
 #ifdef CONFIG_X86_32
 #include <linux/threads.h>
 #include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
 	VVAR_PAGE,
 	VSYSCALL_HPET,
 #endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+	PVCLOCK_FIXMAP_BEGIN,
+	PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
 	FIX_DBGP_BASE,
 	FIX_EARLYCON_MEM_BASE,
 #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
 };
 #endif
 
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_GUEST_H
+#define _ASM_X86_KVM_GUEST_H
+
+int kvm_setup_vsyscall_timeinfo(void);
+
+#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
 #include <linux/kvm_para.h>
 #include <linux/kvm_types.h>
 #include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
 	s8 virtual_tsc_shift;
 	u32 virtual_tsc_mult;
 	u32 virtual_tsc_khz;
+	s64 ia32_tsc_adjust_msr;
 
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
 	u64 cur_tsc_write;
 	u64 cur_tsc_offset;
 	u8  cur_tsc_generation;
+	int nr_vcpus_matched_tsc;
+
+	spinlock_t pvclock_gtod_sync_lock;
+	bool use_master_clock;
+	u64 master_kernel_ns;
+	cycle_t master_cycle_now;
 
 	struct kvm_xen_hvm_config xen_hvm_config;
 
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
 
 struct x86_instruction_info;
 
+struct msr_data {
+	bool host_initiated;
+	u32 index;
+	u64 data;
+};
+
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
 
 	void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
 	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
-	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
 	bool (*has_wbinvd_exit)(void);
 
 	void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
+	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
 	u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
-	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
+	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
 
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
 
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_EBC_FREQUENCY_ID		0x0000002c
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
+#define MSR_IA32_TSC_ADJUST             0x0000003b
 
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
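Background for the new MSR_IA32_TSC_ADJUST define: architecturally, the TSC a logical CPU reports is its raw counter plus IA32_TSC_ADJUST, so writing a new adjust value moves the visible TSC by the same delta. The snippet below is only a hedged model of that bookkeeping as a host might emulate it; the struct and function names are hypothetical and do not appear in this merge.

#include <stdint.h>

/* Hypothetical per-vCPU state: tsc_offset is what the host adds to the
 * hardware TSC for this guest; tsc_adjust mirrors the guest-visible MSR. */
struct vtsc_state {
	int64_t tsc_offset;
	int64_t tsc_adjust;
};

/* Model of WRMSR(IA32_TSC_ADJUST): shift the guest TSC by the change in
 * the adjust value, then remember the new MSR contents. */
static void wrmsr_tsc_adjust(struct vtsc_state *s, int64_t new_adjust)
{
	s->tsc_offset += new_adjust - s->tsc_adjust;
	s->tsc_adjust  = new_adjust;
}

/* Guest-visible TSC for a given hardware counter reading. */
static uint64_t guest_tsc(const struct vtsc_state *s, uint64_t host_tsc)
{
	return host_tsc + (uint64_t)s->tsc_offset;
}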
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
 void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 	return product;
 }
 
+static __always_inline
+u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+{
+	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
+				   src->tsc_shift);
+}
+
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+			       cycle_t *cycles, u8 *flags)
+{
+	unsigned version;
+	cycle_t ret, offset;
+	u8 ret_flags;
+
+	version = src->version;
+	/* Note: emulated platforms which do not advertise SSE2 support
+	 * result in kvmclock not using the necessary RDTSC barriers.
+	 * Without barriers, it is possible that RDTSC instruction reads from
+	 * the time stamp counter outside rdtsc_barrier protected section
+	 * below, resulting in violation of monotonicity.
+	 */
+	rdtsc_barrier();
+	offset = pvclock_get_nsec_offset(src);
+	ret = src->system_time + offset;
+	ret_flags = src->flags;
+	rdtsc_barrier();
+
+	*cycles = ret;
+	*flags = ret_flags;
+	return version;
+}
+
+struct pvclock_vsyscall_time_info {
+	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
 #endif /* _ASM_X86_PVCLOCK_H */
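A quick worked check of the sizing macros just added above (illustrative only: the 64-byte PVTI_SIZE and NR_CPUS=256 are assumed values, since the real ones depend on SMP_CACHE_BYTES and the kernel config): 4096/64 = 64 pvti entries fit in a page, so 256 CPUs need ((256-1)/64)+1 = 4 fixmap pages.

#include <stdio.h>

#define PAGE_SIZE 4096
#define PVTI_SIZE 64	/* assumed sizeof(struct pvclock_vsyscall_time_info) */
#define NR_CPUS   256	/* assumed config value */
#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS - 1) / (PAGE_SIZE / PVTI_SIZE)) + 1)

int main(void)
{
	/* prints: 64 pvti entries per page, 4 fixmap pages for 256 CPUs */
	printf("%d pvti entries per page, %d fixmap pages for %d CPUs\n",
	       PAGE_SIZE / PVTI_SIZE, PVCLOCK_VSYSCALL_NR_PAGES, NR_CPUS);
	return 0;
}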
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
 #define VMX_EPTP_WB_BIT				(1ull << 14)
 #define VMX_EPT_2MB_PAGE_BIT			(1ull << 16)
 #define VMX_EPT_1GB_PAGE_BIT			(1ull << 17)
 #define VMX_EPT_AD_BIT				(1ull << 21)
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT		(1ull << 24)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
  */
 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
 
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+	unsigned int p;
+
+	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
+		/* Load per CPU data from RDTSCP */
+		native_read_tscp(&p);
+	} else {
+		/* Load per CPU data from GDT */
+		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	}
+
+	return p;
+}
+#endif /* CONFIG_X86_64 */
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/elf.h>
 #include <linux/elfcore.h>
+#include <linux/module.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
@@ -30,6 +31,27 @@
 
 int in_crash_kexec;
 
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+	crash_vmclear_fn *do_vmclear_operation = NULL;
+
+	rcu_read_lock();
+	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+	if (do_vmclear_operation)
+		do_vmclear_operation();
+	rcu_read_unlock();
+}
+
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
 
 static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #endif
 	crash_save_cpu(regs, cpu);
 
+	/*
+	 * VMCLEAR VMCSs loaded on all cpus if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Disable VMX or SVM if needed.
 	 *
 	 * We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 
 	kdump_nmi_shootdown_cpus();
 
+	/*
+	 * VMCLEAR VMCSs loaded on this cpu if needed.
+	 */
+	cpu_crash_vmclear_loaded_vmcss();
+
 	/* Booting kdump kernel with VMX or SVM enabled won't work,
 	 * because (among other limitations) we can't disable paging
 	 * with the virt flags.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
 
 static int kvmapf = 1;
 
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
 
 early_param("no-steal-acc", parse_no_stealacc);
 
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+	kvmclock_vsyscall = 0;
+	return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
 	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
 	struct kvm_task_sleep_node n, *e;
 	DEFINE_WAIT(wait);
-	int cpu, idle;
-
-	cpu = get_cpu();
-	idle = idle_cpu(cpu);
-	put_cpu();
 
 	spin_lock(&b->lock);
 	e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
 
 	n.token = token;
 	n.cpu = smp_processor_id();
-	n.halted = idle || preempt_count() > 1;
+	n.halted = is_idle_task(current) || preempt_count() > 1;
 	init_waitqueue_head(&n.wq);
 	hlist_add_head(&n.link, &b->list);
 	spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
+	if (kvmclock_vsyscall)
+		kvm_setup_vsyscall_timeinfo();
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
 #include <asm/apic.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/memblock.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
 early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
 static struct pvclock_wall_clock wall_clock;
 
 /*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
 	struct pvclock_vcpu_time_info *vcpu_time;
 	struct timespec ts;
 	int low, high;
+	int cpu;
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
 
 	native_write_msr(msr_kvm_wall_clock, low, high);
 
-	vcpu_time = &get_cpu_var(hv_clock);
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
-	put_cpu_var(hv_clock);
+
+	preempt_enable();
 
 	return ts.tv_sec;
 }
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
+	int cpu;
 
 	preempt_disable_notrace();
-	src = &__get_cpu_var(hv_clock);
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
 	ret = pvclock_clocksource_read(src);
 	preempt_enable_notrace();
 	return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 static unsigned long kvm_get_tsc_khz(void)
 {
 	struct pvclock_vcpu_time_info *src;
-	src = &per_cpu(hv_clock, 0);
-	return pvclock_tsc_khz(src);
+	int cpu;
+	unsigned long tsc_khz;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+	src = &hv_clock[cpu].pvti;
+	tsc_khz = pvclock_tsc_khz(src);
+	preempt_enable();
+	return tsc_khz;
 }
 
 static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
 {
 	bool ret = false;
 	struct pvclock_vcpu_time_info *src;
+	int cpu = smp_processor_id();
 
-	src = &__get_cpu_var(hv_clock);
+	if (!hv_clock)
+		return ret;
+
+	src = &hv_clock[cpu].pvti;
 	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
-		__this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
 		ret = true;
 	}
 
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
 {
 	int cpu = smp_processor_id();
 	int low, high, ret;
+	struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
 
-	low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
-	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+	low = (int)__pa(src) | 1;
+	high = ((u64)__pa(src) >> 32);
 	ret = native_write_msr_safe(msr_kvm_system_time, low, high);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+	unsigned long mem;
+
 	if (!kvm_para_available())
 		return;
 
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
 	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
 		msr_kvm_system_time, msr_kvm_wall_clock);
 
-	if (kvm_register_clock("boot clock"))
+	mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
+			     PAGE_SIZE);
+	if (!mem)
+		return;
+	hv_clock = __va(mem);
+
+	if (kvm_register_clock("boot clock")) {
+		hv_clock = NULL;
+		memblock_free(mem,
+			sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 		return;
+	}
 	pv_time_ops.sched_clock = kvm_clock_read;
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+	int cpu;
+	int ret;
+	u8 flags;
+	struct pvclock_vcpu_time_info *vcpu_time;
+	unsigned int size;
+
+	size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+
+	preempt_disable();
+	cpu = smp_processor_id();
+
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+		preempt_enable();
+		return 1;
+	}
+
+	if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+		preempt_enable();
+		return ret;
+	}
+
+	preempt_enable();
+
+	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+	return 0;
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
 
 #include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
 #include <asm/pvclock.h>
 
-/*
- * These are perodically updated
- *    xen: magic shared_info page
- *    kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
-	u64 tsc_timestamp;     /* TSC at last update of time vals.  */
-	u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
-	u32 tsc_to_nsec_mul;
-	int tsc_shift;
-	u32 version;
-	u8  flags;
-};
-
 static u8 valid_flags __read_mostly = 0;
 
 void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
 	valid_flags = flags;
 }
 
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
-	u64 delta = native_read_tsc() - shadow->tsc_timestamp;
-	return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
-				   shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
-					struct pvclock_vcpu_time_info *src)
-{
-	do {
-		dst->version = src->version;
-		rmb();		/* fetch version before data */
-		dst->tsc_timestamp     = src->tsc_timestamp;
-		dst->system_timestamp  = src->system_time;
-		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
-		dst->tsc_shift         = src->tsc_shift;
-		dst->flags             = src->flags;
-		rmb();		/* test version after fetching data */
-	} while ((src->version & 1) || (dst->version != src->version));
-
-	return dst->version;
-}
-
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 {
 	u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
 	atomic64_set(&last_value, 0);
 }
 
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+	unsigned version;
+	cycle_t ret;
+	u8 flags;
+
+	do {
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
+
+	return flags & valid_flags;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
-	struct pvclock_shadow_time shadow;
 	unsigned version;
-	cycle_t ret, offset;
+	cycle_t ret;
 	u64 last;
+	u8 flags;
 
 	do {
-		version = pvclock_get_time_values(&shadow, src);
-		barrier();
-		offset = pvclock_get_nsec_offset(&shadow);
-		ret = shadow.system_timestamp + offset;
-		barrier();
-	} while (version != src->version);
+		version = __pvclock_read_cycles(src, &ret, &flags);
+	} while ((src->version & 1) || version != src->version);
 
 	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
-		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		(flags & PVCLOCK_TSC_STABLE_BIT))
 		return ret;
 
 	/*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
+
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
+#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+			void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
+/*
+ * Initialize the generic pvclock vsyscall state.  This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+				 int size)
+{
+	int idx;
+
+	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+	pvclock_vdso_info = i;
+
+	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+			     __pa_symbol(i) + (idx*PAGE_SIZE),
+			     PAGE_KERNEL_VVAR);
+	}
+
+	register_task_migration_notifier(&pvclock_migrate);
+
+	return 0;
+}
+#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	if (index == 0) {
 		entry->ebx &= kvm_supported_word9_x86_features;
 		cpuid_mask(&entry->ebx, 9);
+		// TSC_ADJUST is emulated
+		entry->ebx |= F(TSC_ADJUST);
 	} else
 		entry->ebx = 0;
 	entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
 }
+EXPORT_SYMBOL_GPL(kvm_cpuid);
 
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 						addr.seg);
 	if (!usable)
 		goto bad;
-	/* code segment or read-only data segment */
-	if (((desc.type & 8) || !(desc.type & 2)) && write)
+	/* code segment in protected mode or read-only data segment */
+	if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+	     || !(desc.type & 2)) && write)
 		goto bad;
 	/* unreadable code segment */
 	if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	    || (!vcpu->arch.mmu.direct_map && write_fault
 		&& !is_write_protection(vcpu) && !user_fault)) {
 
+		/*
+		 * There are two cases:
+		 * - the one is other vcpu creates new sp in the window
+		 *   between mapping_level() and acquiring mmu-lock.
+		 * - the another case is the new sp is created by itself
+		 *   (page-fault path) when guest uses the target gfn as
+		 *   its page table.
+		 * Both of these cases can be fixed by allowing guest to
+		 *   retry the access, it will refault, then we can establish
+		 *   the mapping by using small page.
+		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
-			ret = 1;
-			drop_spte(vcpu->kvm, sptep);
+		    has_wrprotected_page(vcpu->kvm, gfn, level))
 			goto done;
-		}
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 	mmu_free_roots(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 	return gfn_to_pfn_memslot_atomic(slot, gfn);
 }
 
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu_page *sp, u64 *spte,
+				  u64 gpte)
+{
+	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+		goto no_present;
+
+	if (!is_present_gpte(gpte))
+		goto no_present;
+
+	if (!(gpte & PT_ACCESSED_MASK))
+		goto no_present;
+
+	return false;
+
+no_present:
+	drop_spte(vcpu->kvm, spte);
+	return true;
+}
+
 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp,
 				    u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
-	return unlikely(is_invalid_pfn(pfn));
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 				pfn_t pfn, unsigned access, int *ret_val)
 {
 	bool ret = true;
 
 	/* The pfn is invalid, report the error! */
-	if (unlikely(is_invalid_pfn(pfn))) {
+	if (unlikely(is_error_pfn(pfn))) {
 		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
 		goto exit;
 	}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-	int bit7;
-
-	bit7 = (gpte >> 7) & 1;
-	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
 static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
 {
 	unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6fe..891eb6d93b8b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
 					addr, access);
 }
 
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp, u64 *spte,
-				    pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+		     u64 *spte, pt_element_t gpte, bool no_dirty_log)
 {
-	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
-		goto no_present;
-
-	if (!is_present_gpte(gpte))
-		goto no_present;
-
-	if (!(gpte & PT_ACCESSED_MASK))
-		goto no_present;
-
-	return false;
-
-no_present:
-	drop_spte(vcpu->kvm, spte);
-	return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			      u64 *spte, const void *pte)
-{
-	pt_element_t gpte;
 	unsigned pte_access;
+	gfn_t gfn;
 	pfn_t pfn;
 
-	gpte = *(const pt_element_t *)pte;
-	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-		return;
+	if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+		return false;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & gpte_access(vcpu, gpte);
 	protect_clean_gpte(&pte_access, gpte);
-	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (mmu_invalid_pfn(pfn))
-		return;
+	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+			no_dirty_log && (pte_access & ACC_WRITE_MASK));
+	if (is_error_pfn(pfn))
+		return false;
 
 	/*
-	 * we call mmu_set_spte() with host_writable = true because that
-	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+	 * we call mmu_set_spte() with host_writable = true because
+	 * pte_prefetch_gfn_to_pfn always gets a writable pfn.
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     NULL, PT_PAGE_TABLE_LEVEL,
-		     gpte_to_gfn(gpte), pfn, true, true);
+		     NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+
+	return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			      u64 *spte, const void *pte)
+{
+	pt_element_t gpte = *(const pt_element_t *)pte;
+
+	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
 }
 
 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		pt_element_t gpte;
-		unsigned pte_access;
-		gfn_t gfn;
-		pfn_t pfn;
-
 		if (spte == sptep)
 			continue;
 
 		if (is_shadow_present_pte(*spte))
 			continue;
 
-		gpte = gptep[i];
-
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
-			continue;
-
-		pte_access = sp->role.access & gpte_access(vcpu, gpte);
-		protect_clean_gpte(&pte_access, gpte);
-		gfn = gpte_to_gfn(gpte);
-		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-				      pte_access & ACC_WRITE_MASK);
-		if (mmu_invalid_pfn(pfn))
+		if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
 			break;
-
-		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
-			     pfn, true, true);
 	}
 }
 
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
  */
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *emulate, pfn_t pfn, bool map_writable,
-			 bool prefault)
+			 pfn_t pfn, bool map_writable, bool prefault)
 {
-	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	int top_level;
-	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
+	unsigned direct_access, access = gw->pt_access;
+	int top_level, emulate = 0;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
-		return NULL;
+		return 0;
 
 	direct_access = gw->pte_access;
 
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 	clear_sp_write_flooding_count(it.sptep);
 	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
-		     user_fault, write_fault, emulate, it.level,
+		     user_fault, write_fault, &emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
-	return it.sptep;
+	return emulate;
 
 out_gpte_changed:
 	if (sp)
 		kvm_mmu_put_page(sp, it.sptep);
 	kvm_release_pfn_clean(pfn);
-	return NULL;
+	return 0;
 }
 
 /*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int write_fault = error_code & PFERR_WRITE_MASK;
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
-	u64 *sptep;
-	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
-	if (mmu_notifier_retry(vcpu, mmu_seq))
+	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
 	kvm_mmu_free_some_pages(vcpu);
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
-	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &emulate, pfn, map_writable, prefault);
-	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
-		 sptep, *sptep, emulate);
-
+	r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+			 level, pfn, map_writable, prefault);
 	++vcpu->stat.pf_fixed;
 	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return emulate;
+	return r;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+		if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899ef..d29d3cd1c156 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h" 21#include "kvm_cache_regs.h"
22#include "x86.h" 22#include "x86.h"
23#include "cpuid.h"
23 24
24#include <linux/module.h> 25#include <linux/module.h>
25#include <linux/mod_devicetable.h> 26#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
630 return -EBUSY; 631 return -EBUSY;
631 632
632 if (!has_svm()) { 633 if (!has_svm()) {
633 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 634 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
634 me);
635 return -EINVAL; 635 return -EINVAL;
636 } 636 }
637 sd = per_cpu(svm_data, me); 637 sd = per_cpu(svm_data, me);
638
639 if (!sd) { 638 if (!sd) {
640 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", 639 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
641 me);
642 return -EINVAL; 640 return -EINVAL;
643 } 641 }
644 642
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1012 svm->tsc_ratio = ratio; 1010 svm->tsc_ratio = ratio;
1013} 1011}
1014 1012
1013static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
1014{
1015 struct vcpu_svm *svm = to_svm(vcpu);
1016
1017 return svm->vmcb->control.tsc_offset;
1018}
1019
1015static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1020static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1016{ 1021{
1017 struct vcpu_svm *svm = to_svm(vcpu); 1022 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
1189static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1194static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1190{ 1195{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1196 struct vcpu_svm *svm = to_svm(vcpu);
1197 u32 dummy;
1198 u32 eax = 1;
1192 1199
1193 init_vmcb(svm); 1200 init_vmcb(svm);
1194 1201
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
1197 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1204 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
1198 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1205 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
1199 } 1206 }
1200 vcpu->arch.regs_avail = ~0; 1207
1201 vcpu->arch.regs_dirty = ~0; 1208 kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
1209 kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
1202 1210
1203 return 0; 1211 return 0;
1204} 1212}
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1254 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1262 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
1255 svm->asid_generation = 0; 1263 svm->asid_generation = 0;
1256 init_vmcb(svm); 1264 init_vmcb(svm);
1257 kvm_write_tsc(&svm->vcpu, 0);
1258
1259 err = fx_init(&svm->vcpu);
1260 if (err)
1261 goto free_page4;
1262 1265
1263 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1266 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
1264 if (kvm_vcpu_is_bsp(&svm->vcpu)) 1267 if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1268 1271
1269 return &svm->vcpu; 1272 return &svm->vcpu;
1270 1273
1271free_page4:
1272 __free_page(hsave_page);
1273free_page3: 1274free_page3:
1274 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 1275 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
1275free_page2: 1276free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
3008 return 0; 3009 return 0;
3009} 3010}
3010 3011
3011u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) 3012u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
3012{ 3013{
3013 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3014 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
3014 return vmcb->control.tsc_offset + 3015 return vmcb->control.tsc_offset +
3015 svm_scale_tsc(vcpu, native_read_tsc()); 3016 svm_scale_tsc(vcpu, host_tsc);
3016} 3017}
3017 3018
3018static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) 3019static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
3131 return 0; 3132 return 0;
3132} 3133}
3133 3134
3134static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 3135static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
3135{ 3136{
3136 struct vcpu_svm *svm = to_svm(vcpu); 3137 struct vcpu_svm *svm = to_svm(vcpu);
3137 3138
3139 u32 ecx = msr->index;
3140 u64 data = msr->data;
3138 switch (ecx) { 3141 switch (ecx) {
3139 case MSR_IA32_TSC: 3142 case MSR_IA32_TSC:
3140 kvm_write_tsc(vcpu, data); 3143 kvm_write_tsc(vcpu, msr);
3141 break; 3144 break;
3142 case MSR_STAR: 3145 case MSR_STAR:
3143 svm->vmcb->save.star = data; 3146 svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
3192 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3195 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
3193 break; 3196 break;
3194 default: 3197 default:
3195 return kvm_set_msr_common(vcpu, ecx, data); 3198 return kvm_set_msr_common(vcpu, msr);
3196 } 3199 }
3197 return 0; 3200 return 0;
3198} 3201}
3199 3202
3200static int wrmsr_interception(struct vcpu_svm *svm) 3203static int wrmsr_interception(struct vcpu_svm *svm)
3201{ 3204{
3205 struct msr_data msr;
3202 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3206 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3203 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3207 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
3204 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3208 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3205 3209
3210 msr.data = data;
3211 msr.index = ecx;
3212 msr.host_initiated = false;
3206 3213
3207 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3214 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3208 if (svm_set_msr(&svm->vcpu, ecx, data)) { 3215 if (svm_set_msr(&svm->vcpu, &msr)) {
3209 trace_kvm_msr_write_ex(ecx, data); 3216 trace_kvm_msr_write_ex(ecx, data);
3210 kvm_inject_gp(&svm->vcpu, 0); 3217 kvm_inject_gp(&svm->vcpu, 0);
3211 } else { 3218 } else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
4302 .has_wbinvd_exit = svm_has_wbinvd_exit, 4309 .has_wbinvd_exit = svm_has_wbinvd_exit,
4303 4310
4304 .set_tsc_khz = svm_set_tsc_khz, 4311 .set_tsc_khz = svm_set_tsc_khz,
4312 .read_tsc_offset = svm_read_tsc_offset,
4305 .write_tsc_offset = svm_write_tsc_offset, 4313 .write_tsc_offset = svm_write_tsc_offset,
4306 .adjust_tsc_offset = svm_adjust_tsc_offset, 4314 .adjust_tsc_offset = svm_adjust_tsc_offset,
4307 .compute_tsc_offset = svm_compute_tsc_offset, 4315 .compute_tsc_offset = svm_compute_tsc_offset,
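
The svm.c hunks above (and the matching vmx.c/x86.c hunks further down) replace the bare (index, data) pair in the set_msr path with a struct msr_data carrier, so a host-initiated write (state restore) can be told apart from a guest WRMSR. A minimal standalone sketch of that packaging, illustrative only — the field names follow the diff, everything else (the demo handler, the sample operands) is made up:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Carrier mirroring the struct msr_data introduced by this series;
 * demo_set_msr() is a stand-in for svm_set_msr()/vmx_set_msr(). */
struct msr_data {
        bool     host_initiated;
        uint32_t index;
        uint64_t data;
};

static int demo_set_msr(const struct msr_data *msr)
{
        /* The extra flag is the point of the change: a host-initiated
         * write can be treated differently from a guest WRMSR. */
        printf("MSR 0x%x <- 0x%llx (%s)\n", (unsigned)msr->index,
               (unsigned long long)msr->data,
               msr->host_initiated ? "host" : "guest");
        return 0;
}

int main(void)
{
        uint32_t eax = 0x1, edx = 0x2, ecx = 0x10;  /* sample WRMSR operands */
        struct msr_data msr = {
                .host_initiated = false,            /* guest-originated */
                .index = ecx,
                .data  = ((uint64_t)edx << 32) | eax,
        };
        return demo_set_msr(&msr);
}
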
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..fe5e00ed7036 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
4#include <linux/tracepoint.h> 4#include <linux/tracepoint.h>
5#include <asm/vmx.h> 5#include <asm/vmx.h>
6#include <asm/svm.h> 6#include <asm/svm.h>
7#include <asm/clocksource.h>
7 8
8#undef TRACE_SYSTEM 9#undef TRACE_SYSTEM
9#define TRACE_SYSTEM kvm 10#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
754 __entry->write ? "Write" : "Read", 755 __entry->write ? "Write" : "Read",
755 __entry->gpa_match ? "GPA" : "GVA") 756 __entry->gpa_match ? "GPA" : "GVA")
756); 757);
758
759#ifdef CONFIG_X86_64
760
761#define host_clocks \
762 {VCLOCK_NONE, "none"}, \
763 {VCLOCK_TSC, "tsc"}, \
764 {VCLOCK_HPET, "hpet"} \
765
766TRACE_EVENT(kvm_update_master_clock,
767 TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
768 TP_ARGS(use_master_clock, host_clock, offset_matched),
769
770 TP_STRUCT__entry(
771 __field( bool, use_master_clock )
772 __field( unsigned int, host_clock )
773 __field( bool, offset_matched )
774 ),
775
776 TP_fast_assign(
777 __entry->use_master_clock = use_master_clock;
778 __entry->host_clock = host_clock;
779 __entry->offset_matched = offset_matched;
780 ),
781
782 TP_printk("masterclock %d hostclock %s offsetmatched %u",
783 __entry->use_master_clock,
784 __print_symbolic(__entry->host_clock, host_clocks),
785 __entry->offset_matched)
786);
787
788TRACE_EVENT(kvm_track_tsc,
789 TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
790 unsigned int online_vcpus, bool use_master_clock,
791 unsigned int host_clock),
792 TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
793 host_clock),
794
795 TP_STRUCT__entry(
796 __field( unsigned int, vcpu_id )
797 __field( unsigned int, nr_vcpus_matched_tsc )
798 __field( unsigned int, online_vcpus )
799 __field( bool, use_master_clock )
800 __field( unsigned int, host_clock )
801 ),
802
803 TP_fast_assign(
804 __entry->vcpu_id = vcpu_id;
805 __entry->nr_vcpus_matched_tsc = nr_matched;
806 __entry->online_vcpus = online_vcpus;
807 __entry->use_master_clock = use_master_clock;
808 __entry->host_clock = host_clock;
809 ),
810
811 TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
812 " hostclock %s",
813 __entry->vcpu_id, __entry->use_master_clock,
814 __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
815 __print_symbolic(__entry->host_clock, host_clocks))
816);
817
818#endif /* CONFIG_X86_64 */
819
757#endif /* _TRACE_KVM_H */ 820#endif /* _TRACE_KVM_H */
758 821
759#undef TRACE_INCLUDE_PATH 822#undef TRACE_INCLUDE_PATH
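
The two events added above are emitted from arch/x86/kvm/x86.c further down in this patch (pvclock_update_vm_gtod_copy() and kvm_track_tsc_matching()), with __print_symbolic() rendering the numeric clocksource mode through the host_clocks table. A standalone sketch of what the kvm_update_master_clock line ends up looking like — the helper and the sample values below are invented, and the VCLOCK_* numbering is an assumption about the host headers:

#include <stdio.h>

/* Illustrative only: reproduces the TP_printk() format of the
 * kvm_update_master_clock event above with made-up values. */
static const char *host_clock_name(unsigned int mode)
{
        switch (mode) {
        case 0:  return "none";  /* VCLOCK_NONE */
        case 1:  return "tsc";   /* VCLOCK_TSC  */
        case 2:  return "hpet";  /* VCLOCK_HPET */
        default: return "?";
        }
}

int main(void)
{
        int use_master_clock = 1;
        unsigned int host_clock = 1, offset_matched = 1;  /* VCLOCK_TSC */

        printf("masterclock %d hostclock %s offsetmatched %u\n",
               use_master_clock, host_clock_name(host_clock),
               offset_matched);
        return 0;
}
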
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
42#include <asm/i387.h> 42#include <asm/i387.h>
43#include <asm/xcr.h> 43#include <asm/xcr.h>
44#include <asm/perf_event.h> 44#include <asm/perf_event.h>
45#include <asm/kexec.h>
45 46
46#include "trace.h" 47#include "trace.h"
47 48
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
802 return vmx_capability.ept & VMX_EPT_AD_BIT; 803 return vmx_capability.ept & VMX_EPT_AD_BIT;
803} 804}
804 805
805static inline bool cpu_has_vmx_invept_individual_addr(void)
806{
807 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
808}
809
810static inline bool cpu_has_vmx_invept_context(void) 806static inline bool cpu_has_vmx_invept_context(void)
811{ 807{
812 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; 808 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
992 vmcs, phys_addr); 988 vmcs, phys_addr);
993} 989}
994 990
991#ifdef CONFIG_KEXEC
992/*
993 * This bitmap is used to indicate whether the vmclear
994 * operation is enabled on all cpus. All disabled by
995 * default.
996 */
997static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
998
999static inline void crash_enable_local_vmclear(int cpu)
1000{
1001 cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1002}
1003
1004static inline void crash_disable_local_vmclear(int cpu)
1005{
1006 cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1007}
1008
1009static inline int crash_local_vmclear_enabled(int cpu)
1010{
1011 return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1012}
1013
1014static void crash_vmclear_local_loaded_vmcss(void)
1015{
1016 int cpu = raw_smp_processor_id();
1017 struct loaded_vmcs *v;
1018
1019 if (!crash_local_vmclear_enabled(cpu))
1020 return;
1021
1022 list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1023 loaded_vmcss_on_cpu_link)
1024 vmcs_clear(v->vmcs);
1025}
1026#else
1027static inline void crash_enable_local_vmclear(int cpu) { }
1028static inline void crash_disable_local_vmclear(int cpu) { }
1029#endif /* CONFIG_KEXEC */
1030
995static void __loaded_vmcs_clear(void *arg) 1031static void __loaded_vmcs_clear(void *arg)
996{ 1032{
997 struct loaded_vmcs *loaded_vmcs = arg; 1033 struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
1001 return; /* vcpu migration can race with cpu offline */ 1037 return; /* vcpu migration can race with cpu offline */
1002 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1038 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1003 per_cpu(current_vmcs, cpu) = NULL; 1039 per_cpu(current_vmcs, cpu) = NULL;
1040 crash_disable_local_vmclear(cpu);
1004 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1041 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1042
1043 /*
 1044 * Ensure the update of loaded_vmcs->loaded_vmcss_on_cpu_link is
 1045 * ordered before setting loaded_vmcs->cpu to -1, which is done in
 1046 * loaded_vmcs_init. Otherwise another cpu could see cpu == -1 first
 1047 * and then add the vmcs to its percpu list before it is deleted.
1048 */
1049 smp_wmb();
1050
1005 loaded_vmcs_init(loaded_vmcs); 1051 loaded_vmcs_init(loaded_vmcs);
1052 crash_enable_local_vmclear(cpu);
1006} 1053}
1007 1054
1008static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1055static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1009{ 1056{
1010 if (loaded_vmcs->cpu != -1) 1057 int cpu = loaded_vmcs->cpu;
1011 smp_call_function_single( 1058
1012 loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); 1059 if (cpu != -1)
1060 smp_call_function_single(cpu,
1061 __loaded_vmcs_clear, loaded_vmcs, 1);
1013} 1062}
1014 1063
1015static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 1064static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
1051 } 1100 }
1052} 1101}
1053 1102
1054static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
1055{
1056 if (enable_ept) {
1057 if (cpu_has_vmx_invept_individual_addr())
1058 __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
1059 eptp, gpa);
1060 else
1061 ept_sync_context(eptp);
1062 }
1063}
1064
1065static __always_inline unsigned long vmcs_readl(unsigned long field) 1103static __always_inline unsigned long vmcs_readl(unsigned long field)
1066{ 1104{
1067 unsigned long value; 1105 unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1535 1573
1536 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1574 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1537 local_irq_disable(); 1575 local_irq_disable();
1576 crash_disable_local_vmclear(cpu);
1577
1578 /*
 1579 * The read of loaded_vmcs->cpu must be ordered before fetching
 1580 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1581 * See the comments in __loaded_vmcs_clear().
1582 */
1583 smp_rmb();
1584
1538 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1585 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1539 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1586 &per_cpu(loaded_vmcss_on_cpu, cpu));
1587 crash_enable_local_vmclear(cpu);
1540 local_irq_enable(); 1588 local_irq_enable();
1541 1589
1542 /* 1590 /*
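
The smp_wmb() added in __loaded_vmcs_clear() and the smp_rmb() added here form a publish/consume pair: the clearing cpu unlinks the VMCS from its per-cpu list before publishing cpu = -1, and the loading cpu reads cpu before touching the list, so it cannot re-add an entry that still sits on another cpu's list. A standalone sketch of that shape (illustrative and single-threaded; __sync_synchronize() stands in for the weaker, direction-specific kernel barriers):

#include <stdio.h>

static int on_list  = 1;        /* "vmcs sits on the old cpu's list"    */
static int vmcs_cpu = 3;        /* role of loaded_vmcs->cpu             */

static void clear_side(void)    /* __loaded_vmcs_clear() on the old cpu */
{
        on_list = 0;            /* list_del()                           */
        __sync_synchronize();   /* smp_wmb(): unlink first ...          */
        vmcs_cpu = -1;          /* ... then publish cpu = -1            */
}

static void load_side(void)     /* vmx_vcpu_load() on the new cpu       */
{
        int c = vmcs_cpu;       /* read loaded_vmcs->cpu                */
        __sync_synchronize();   /* smp_rmb(): read cpu before the list  */
        if (c == -1 && on_list)
                printf("broken: entry still on the old cpu's list\n");
        else
                printf("safe to list_add() on this cpu\n");
}

int main(void)
{
        clear_side();
        load_side();
        return 0;
}
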
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
1839 * Like guest_read_tsc, but always returns L1's notion of the timestamp 1887 * Like guest_read_tsc, but always returns L1's notion of the timestamp
1840 * counter, even if a nested guest (L2) is currently running. 1888 * counter, even if a nested guest (L2) is currently running.
1841 */ 1889 */
1842u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) 1890u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
1843{ 1891{
1844 u64 host_tsc, tsc_offset; 1892 u64 tsc_offset;
1845 1893
1846 rdtscll(host_tsc);
1847 tsc_offset = is_guest_mode(vcpu) ? 1894 tsc_offset = is_guest_mode(vcpu) ?
1848 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 1895 to_vmx(vcpu)->nested.vmcs01_tsc_offset :
1849 vmcs_read64(TSC_OFFSET); 1896 vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
1866 WARN(1, "user requested TSC rate below hardware speed\n"); 1913 WARN(1, "user requested TSC rate below hardware speed\n");
1867} 1914}
1868 1915
1916static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
1917{
1918 return vmcs_read64(TSC_OFFSET);
1919}
1920
1869/* 1921/*
1870 * writes 'offset' into guest's timestamp counter offset register 1922 * writes 'offset' into guest's timestamp counter offset register
1871 */ 1923 */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2202 * Returns 0 on success, non-0 otherwise. 2254 * Returns 0 on success, non-0 otherwise.
2203 * Assumes vcpu_load() was already called. 2255 * Assumes vcpu_load() was already called.
2204 */ 2256 */
2205static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2257static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2206{ 2258{
2207 struct vcpu_vmx *vmx = to_vmx(vcpu); 2259 struct vcpu_vmx *vmx = to_vmx(vcpu);
2208 struct shared_msr_entry *msr; 2260 struct shared_msr_entry *msr;
2209 int ret = 0; 2261 int ret = 0;
2262 u32 msr_index = msr_info->index;
2263 u64 data = msr_info->data;
2210 2264
2211 switch (msr_index) { 2265 switch (msr_index) {
2212 case MSR_EFER: 2266 case MSR_EFER:
2213 ret = kvm_set_msr_common(vcpu, msr_index, data); 2267 ret = kvm_set_msr_common(vcpu, msr_info);
2214 break; 2268 break;
2215#ifdef CONFIG_X86_64 2269#ifdef CONFIG_X86_64
2216 case MSR_FS_BASE: 2270 case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2236 vmcs_writel(GUEST_SYSENTER_ESP, data); 2290 vmcs_writel(GUEST_SYSENTER_ESP, data);
2237 break; 2291 break;
2238 case MSR_IA32_TSC: 2292 case MSR_IA32_TSC:
2239 kvm_write_tsc(vcpu, data); 2293 kvm_write_tsc(vcpu, msr_info);
2240 break; 2294 break;
2241 case MSR_IA32_CR_PAT: 2295 case MSR_IA32_CR_PAT:
2242 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2296 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2244 vcpu->arch.pat = data; 2298 vcpu->arch.pat = data;
2245 break; 2299 break;
2246 } 2300 }
2247 ret = kvm_set_msr_common(vcpu, msr_index, data); 2301 ret = kvm_set_msr_common(vcpu, msr_info);
2302 break;
2303 case MSR_IA32_TSC_ADJUST:
2304 ret = kvm_set_msr_common(vcpu, msr_info);
2248 break; 2305 break;
2249 case MSR_TSC_AUX: 2306 case MSR_TSC_AUX:
2250 if (!vmx->rdtscp_enabled) 2307 if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
2267 } 2324 }
2268 break; 2325 break;
2269 } 2326 }
2270 ret = kvm_set_msr_common(vcpu, msr_index, data); 2327 ret = kvm_set_msr_common(vcpu, msr_info);
2271 } 2328 }
2272 2329
2273 return ret; 2330 return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
2341 return -EBUSY; 2398 return -EBUSY;
2342 2399
2343 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2400 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2401
2402 /*
2403 * Now we can enable the vmclear operation in kdump
2404 * since the loaded_vmcss_on_cpu list on this cpu
2405 * has been initialized.
2406 *
 2407 * Though the cpu is not in VMX operation yet, it is
 2408 * safe to enable the vmclear operation here because
 2409 * the loaded_vmcss_on_cpu list is still empty.
2410 */
2411 crash_enable_local_vmclear(cpu);
2412
2344 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2413 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2345 2414
2346 test_bits = FEATURE_CONTROL_LOCKED; 2415 test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2766 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
2698 tmp.base = vmcs_readl(sf->base); 2767 tmp.base = vmcs_readl(sf->base);
2699 tmp.selector = vmcs_read16(sf->selector); 2768 tmp.selector = vmcs_read16(sf->selector);
2769 tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
2700 tmp.s = 1; 2770 tmp.s = 1;
2701 } 2771 }
2702 vmx_set_segment(vcpu, &tmp, seg); 2772 vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 3246 * unrestricted guest like Westmere to an older host that doesn't have 3316
 3247 * unrestricted guest like Nehalem. 3317
3248 */ 3318 */
3249 if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3319 if (vmx->rmode.vm86_active) {
3250 switch (seg) { 3320 switch (seg) {
3251 case VCPU_SREG_CS: 3321 case VCPU_SREG_CS:
3252 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); 3322 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
3897 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3967 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
3898 set_cr4_guest_host_mask(vmx); 3968 set_cr4_guest_host_mask(vmx);
3899 3969
3900 kvm_write_tsc(&vmx->vcpu, 0);
3901
3902 return 0; 3970 return 0;
3903} 3971}
3904 3972
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3908 u64 msr; 3976 u64 msr;
3909 int ret; 3977 int ret;
3910 3978
3911 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3912
3913 vmx->rmode.vm86_active = 0; 3979 vmx->rmode.vm86_active = 0;
3914 3980
3915 vmx->soft_vnmi_blocked = 0; 3981 vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3921 msr |= MSR_IA32_APICBASE_BSP; 3987 msr |= MSR_IA32_APICBASE_BSP;
3922 kvm_set_apic_base(&vmx->vcpu, msr); 3988 kvm_set_apic_base(&vmx->vcpu, msr);
3923 3989
3924 ret = fx_init(&vmx->vcpu);
3925 if (ret != 0)
3926 goto out;
3927
3928 vmx_segment_cache_clear(vmx); 3990 vmx_segment_cache_clear(vmx);
3929 3991
3930 seg_setup(VCPU_SREG_CS); 3992 seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
3965 kvm_rip_write(vcpu, 0xfff0); 4027 kvm_rip_write(vcpu, 0xfff0);
3966 else 4028 else
3967 kvm_rip_write(vcpu, 0); 4029 kvm_rip_write(vcpu, 0);
3968 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
3969 4030
3970 vmcs_writel(GUEST_GDTR_BASE, 0); 4031 vmcs_writel(GUEST_GDTR_BASE, 0);
3971 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4032 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4015 /* HACK: Don't enable emulation on guest boot/reset */ 4076 /* HACK: Don't enable emulation on guest boot/reset */
4016 vmx->emulation_required = 0; 4077 vmx->emulation_required = 0;
4017 4078
4018out:
4019 return ret; 4079 return ret;
4020} 4080}
4021 4081
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4287 if (is_machine_check(intr_info)) 4347 if (is_machine_check(intr_info))
4288 return handle_machine_check(vcpu); 4348 return handle_machine_check(vcpu);
4289 4349
4290 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4291 !is_page_fault(intr_info)) {
4292 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4293 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4294 vcpu->run->internal.ndata = 2;
4295 vcpu->run->internal.data[0] = vect_info;
4296 vcpu->run->internal.data[1] = intr_info;
4297 return 0;
4298 }
4299
4300 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4350 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
4301 return 1; /* already handled by vmx_vcpu_run() */ 4351 return 1; /* already handled by vmx_vcpu_run() */
4302 4352
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
4315 error_code = 0; 4365 error_code = 0;
4316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4366 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
4317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4367 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
4368
4369 /*
 4370 * A #PF with PFEC.RSVD = 1 indicates the guest is accessing
 4371 * MMIO; in that case it is better to report an internal error.
4372 * See the comments in vmx_handle_exit.
4373 */
4374 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
4375 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
4376 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
4377 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
4378 vcpu->run->internal.ndata = 2;
4379 vcpu->run->internal.data[0] = vect_info;
4380 vcpu->run->internal.data[1] = intr_info;
4381 return 0;
4382 }
4383
4318 if (is_page_fault(intr_info)) { 4384 if (is_page_fault(intr_info)) {
4319 /* EPT won't cause page fault directly */ 4385 /* EPT won't cause page fault directly */
4320 BUG_ON(enable_ept); 4386 BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
4626 4692
4627static int handle_wrmsr(struct kvm_vcpu *vcpu) 4693static int handle_wrmsr(struct kvm_vcpu *vcpu)
4628{ 4694{
4695 struct msr_data msr;
4629 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4696 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
4630 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4697 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
4631 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4698 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
4632 4699
4633 if (vmx_set_msr(vcpu, ecx, data) != 0) { 4700 msr.data = data;
4701 msr.index = ecx;
4702 msr.host_initiated = false;
4703 if (vmx_set_msr(vcpu, &msr) != 0) {
4634 trace_kvm_msr_write_ex(ecx, data); 4704 trace_kvm_msr_write_ex(ecx, data);
4635 kvm_inject_gp(vcpu, 0); 4705 kvm_inject_gp(vcpu, 0);
4636 return 1; 4706 return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
4827 4897
4828 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4898 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4829 4899
4830 if (exit_qualification & (1 << 6)) {
4831 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
4832 return -EINVAL;
4833 }
4834
4835 gla_validity = (exit_qualification >> 7) & 0x3; 4900 gla_validity = (exit_qualification >> 7) & 0x3;
4836 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 4901 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
4837 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 4902 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
5979 return 0; 6044 return 0;
5980 } 6045 }
5981 6046
6047 /*
6048 * Note:
 6049 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
 6050 * an event delivery, since that indicates the guest is accessing MMIO.
 6051 * The vm-exit could be triggered again after returning to the guest,
 6052 * which would cause an infinite loop.
6053 */
5982 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6054 if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
5983 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 6055 (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
5984 exit_reason != EXIT_REASON_EPT_VIOLATION && 6056 exit_reason != EXIT_REASON_EPT_VIOLATION &&
5985 exit_reason != EXIT_REASON_TASK_SWITCH)) 6057 exit_reason != EXIT_REASON_TASK_SWITCH)) {
5986 printk(KERN_WARNING "%s: unexpected, valid vectoring info " 6058 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5987 "(0x%x) and exit reason is 0x%x\n", 6059 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
5988 __func__, vectoring_info, exit_reason); 6060 vcpu->run->internal.ndata = 2;
6061 vcpu->run->internal.data[0] = vectoring_info;
6062 vcpu->run->internal.data[1] = exit_reason;
6063 return 0;
6064 }
5989 6065
5990 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6066 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
5991 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( 6067 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
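
The hunk above turns what used to be a printk warning into a structured exit: userspace gets KVM_EXIT_INTERNAL_ERROR with the new KVM_INTERNAL_ERROR_DELIVERY_EV suberror plus the raw vectoring info and exit reason in internal.data[]. A toy decoder of that payload — the struct below is a local stand-in for the relevant slice of struct kvm_run, not the real uapi layout, and the sample numbers are invented:

#include <stdint.h>
#include <stdio.h>

/* Local mock of the fields a VMM inspects after KVM_RUN returns with
 * exit_reason == KVM_EXIT_INTERNAL_ERROR; real definitions live in
 * <linux/kvm.h>. */
struct mock_internal_error {
        uint32_t suberror;      /* compared against KVM_INTERNAL_ERROR_* */
        uint32_t ndata;
        uint64_t data[16];
};

static void report(const struct mock_internal_error *e)
{
        fprintf(stderr, "KVM internal error, suberror %u\n", e->suberror);
        if (e->ndata >= 2)
                fprintf(stderr, "  vectoring info 0x%llx, exit reason 0x%llx\n",
                        (unsigned long long)e->data[0],
                        (unsigned long long)e->data[1]);
}

int main(void)
{
        struct mock_internal_error e = {
                .suberror = 0,                    /* placeholder value   */
                .ndata    = 2,
                .data     = { 0x80000b0e, 0x30 }, /* sample values only  */
        };
        report(&e);
        return 0;
}
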
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
7309 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7385 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
7310 7386
7311 .set_tsc_khz = vmx_set_tsc_khz, 7387 .set_tsc_khz = vmx_set_tsc_khz,
7388 .read_tsc_offset = vmx_read_tsc_offset,
7312 .write_tsc_offset = vmx_write_tsc_offset, 7389 .write_tsc_offset = vmx_write_tsc_offset,
7313 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7390 .adjust_tsc_offset = vmx_adjust_tsc_offset,
7314 .compute_tsc_offset = vmx_compute_tsc_offset, 7391 .compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
7367 if (r) 7444 if (r)
7368 goto out3; 7445 goto out3;
7369 7446
7447#ifdef CONFIG_KEXEC
7448 rcu_assign_pointer(crash_vmclear_loaded_vmcss,
7449 crash_vmclear_local_loaded_vmcss);
7450#endif
7451
7370 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 7452 vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
7371 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 7453 vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
7372 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); 7454 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
7404 free_page((unsigned long)vmx_io_bitmap_b); 7486 free_page((unsigned long)vmx_io_bitmap_b);
7405 free_page((unsigned long)vmx_io_bitmap_a); 7487 free_page((unsigned long)vmx_io_bitmap_a);
7406 7488
7489#ifdef CONFIG_KEXEC
7490 rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
7491 synchronize_rcu();
7492#endif
7493
7407 kvm_exit(); 7494 kvm_exit();
7408} 7495}
7409 7496
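
vmx_init() above publishes crash_vmclear_local_loaded_vmcss() through the RCU-protected crash_vmclear_loaded_vmcss hook, which the kexec/kdump side is expected to dereference at crash time, and vmx_exit() clears it and waits with synchronize_rcu() before the module text can go away. A standalone sketch of that register/use/unregister shape — a plain function pointer stands in for the RCU-managed one, and the RCU primitives only appear in comments:

#include <stddef.h>
#include <stdio.h>

typedef void (*crash_hook_t)(void);
static crash_hook_t crash_hook;            /* rcu_assign_pointer() target */

static void vmclear_local_vmcss(void)      /* role of the VMX callback    */
{
        printf("VMCLEAR all VMCSs loaded on this cpu\n");
}

static void module_init_like(void)
{
        crash_hook = vmclear_local_vmcss;  /* rcu_assign_pointer(hook, fn) */
}

static void module_exit_like(void)
{
        crash_hook = NULL;                 /* rcu_assign_pointer(hook, NULL) */
        /* synchronize_rcu(): wait until no crash path can still be running
         * the old callback before the code behind it is unloaded.          */
}

static void crash_path(void)
{
        crash_hook_t fn = crash_hook;      /* rcu_dereference(hook)        */
        if (fn)
                fn();
}

int main(void)
{
        module_init_like();
        crash_path();
        module_exit_like();
        crash_path();                      /* hook gone: nothing runs      */
        return 0;
}
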
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
46#include <linux/uaccess.h> 46#include <linux/uaccess.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/pci.h> 48#include <linux/pci.h>
49#include <linux/timekeeper_internal.h>
50#include <linux/pvclock_gtod.h>
49#include <trace/events/kvm.h> 51#include <trace/events/kvm.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
158 160
159u64 __read_mostly host_xcr0; 161u64 __read_mostly host_xcr0;
160 162
161int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 163static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
164
165static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
162 166
163static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 167static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
164{ 168{
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
633 } 637 }
634 638
635 if (is_long_mode(vcpu)) { 639 if (is_long_mode(vcpu)) {
636 if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { 640 if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
637 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 641 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
638 return 1; 642 return 1;
639 } else 643 } else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
827static unsigned num_msrs_to_save; 831static unsigned num_msrs_to_save;
828 832
829static const u32 emulated_msrs[] = { 833static const u32 emulated_msrs[] = {
834 MSR_IA32_TSC_ADJUST,
830 MSR_IA32_TSCDEADLINE, 835 MSR_IA32_TSCDEADLINE,
831 MSR_IA32_MISC_ENABLE, 836 MSR_IA32_MISC_ENABLE,
832 MSR_IA32_MCG_STATUS, 837 MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
886 * Returns 0 on success, non-0 otherwise. 891 * Returns 0 on success, non-0 otherwise.
887 * Assumes vcpu_load() was already called. 892 * Assumes vcpu_load() was already called.
888 */ 893 */
889int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 894int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
890{ 895{
891 return kvm_x86_ops->set_msr(vcpu, msr_index, data); 896 return kvm_x86_ops->set_msr(vcpu, msr);
892} 897}
893 898
894/* 899/*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
896 */ 901 */
897static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 902static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
898{ 903{
899 return kvm_set_msr(vcpu, index, *data); 904 struct msr_data msr;
905
906 msr.data = *data;
907 msr.index = index;
908 msr.host_initiated = true;
909 return kvm_set_msr(vcpu, &msr);
900} 910}
901 911
912#ifdef CONFIG_X86_64
913struct pvclock_gtod_data {
914 seqcount_t seq;
915
916 struct { /* extract of a clocksource struct */
917 int vclock_mode;
918 cycle_t cycle_last;
919 cycle_t mask;
920 u32 mult;
921 u32 shift;
922 } clock;
923
924 /* open coded 'struct timespec' */
925 u64 monotonic_time_snsec;
926 time_t monotonic_time_sec;
927};
928
929static struct pvclock_gtod_data pvclock_gtod_data;
930
931static void update_pvclock_gtod(struct timekeeper *tk)
932{
933 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
934
935 write_seqcount_begin(&vdata->seq);
936
937 /* copy pvclock gtod data */
938 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
939 vdata->clock.cycle_last = tk->clock->cycle_last;
940 vdata->clock.mask = tk->clock->mask;
941 vdata->clock.mult = tk->mult;
942 vdata->clock.shift = tk->shift;
943
944 vdata->monotonic_time_sec = tk->xtime_sec
945 + tk->wall_to_monotonic.tv_sec;
946 vdata->monotonic_time_snsec = tk->xtime_nsec
947 + (tk->wall_to_monotonic.tv_nsec
948 << tk->shift);
949 while (vdata->monotonic_time_snsec >=
950 (((u64)NSEC_PER_SEC) << tk->shift)) {
951 vdata->monotonic_time_snsec -=
952 ((u64)NSEC_PER_SEC) << tk->shift;
953 vdata->monotonic_time_sec++;
954 }
955
956 write_seqcount_end(&vdata->seq);
957}
958#endif
959
960
902static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 961static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
903{ 962{
904 int version; 963 int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
995 return timespec_to_ns(&ts); 1054 return timespec_to_ns(&ts);
996} 1055}
997 1056
1057#ifdef CONFIG_X86_64
1058static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1059#endif
1060
998static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1061static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
999unsigned long max_tsc_khz; 1062unsigned long max_tsc_khz;
1000 1063
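
update_pvclock_gtod() above is the writer side of a seqcount: the reader (do_monotonic(), added further down) retries whenever the sequence is odd or changes under it. A minimal standalone version of that retry protocol — a plain counter stands in for seqcount_t, and the memory barriers the real primitives provide are deliberately omitted:

#include <stdio.h>

/* Toy seqcount: even = stable, odd = write in progress. */
static volatile unsigned seq;
static long long snapshot_ns;

static void writer_update(long long ns)      /* update_pvclock_gtod() role */
{
        seq++;                               /* write_seqcount_begin()     */
        snapshot_ns = ns;
        seq++;                               /* write_seqcount_end()       */
}

static long long reader_get(void)            /* do_monotonic() role        */
{
        unsigned start;
        long long v;

        do {
                start = seq;                 /* read_seqcount_begin()      */
                v = snapshot_ns;
        } while (start != seq || (start & 1)); /* read_seqcount_retry()    */
        return v;
}

int main(void)
{
        writer_update(123456789);
        printf("monotonic snapshot: %lld ns\n", reader_get());
        return 0;
}
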
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1046 return tsc; 1109 return tsc;
1047} 1110}
1048 1111
1049void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1112void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1113{
1114#ifdef CONFIG_X86_64
1115 bool vcpus_matched;
1116 bool do_request = false;
1117 struct kvm_arch *ka = &vcpu->kvm->arch;
1118 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1119
1120 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1121 atomic_read(&vcpu->kvm->online_vcpus));
1122
1123 if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
1124 if (!ka->use_master_clock)
1125 do_request = 1;
1126
1127 if (!vcpus_matched && ka->use_master_clock)
1128 do_request = 1;
1129
1130 if (do_request)
1131 kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1132
1133 trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1134 atomic_read(&vcpu->kvm->online_vcpus),
1135 ka->use_master_clock, gtod->clock.vclock_mode);
1136#endif
1137}
1138
1139static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1140{
1141 u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1142 vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1143}
1144
1145void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1050{ 1146{
1051 struct kvm *kvm = vcpu->kvm; 1147 struct kvm *kvm = vcpu->kvm;
1052 u64 offset, ns, elapsed; 1148 u64 offset, ns, elapsed;
1053 unsigned long flags; 1149 unsigned long flags;
1054 s64 usdiff; 1150 s64 usdiff;
1151 bool matched;
1152 u64 data = msr->data;
1055 1153
1056 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1154 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1057 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1155 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1094 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1192 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1095 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1193 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1096 } 1194 }
1195 matched = true;
1097 } else { 1196 } else {
1098 /* 1197 /*
1099 * We split periods of matched TSC writes into generations. 1198 * We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1108 kvm->arch.cur_tsc_nsec = ns; 1207 kvm->arch.cur_tsc_nsec = ns;
1109 kvm->arch.cur_tsc_write = data; 1208 kvm->arch.cur_tsc_write = data;
1110 kvm->arch.cur_tsc_offset = offset; 1209 kvm->arch.cur_tsc_offset = offset;
1210 matched = false;
1111 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 pr_debug("kvm: new tsc generation %u, clock %llu\n",
1112 kvm->arch.cur_tsc_generation, data); 1212 kvm->arch.cur_tsc_generation, data);
1113 } 1213 }
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1129 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1229 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1130 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1230 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1131 1231
1232 if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1233 update_ia32_tsc_adjust_msr(vcpu, offset);
1132 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1234 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1133 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1235 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1236
1237 spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1238 if (matched)
1239 kvm->arch.nr_vcpus_matched_tsc++;
1240 else
1241 kvm->arch.nr_vcpus_matched_tsc = 0;
1242
1243 kvm_track_tsc_matching(vcpu);
1244 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1134} 1245}
1135 1246
1136EXPORT_SYMBOL_GPL(kvm_write_tsc); 1247EXPORT_SYMBOL_GPL(kvm_write_tsc);
1137 1248
1249#ifdef CONFIG_X86_64
1250
1251static cycle_t read_tsc(void)
1252{
1253 cycle_t ret;
1254 u64 last;
1255
1256 /*
1257 * Empirically, a fence (of type that depends on the CPU)
1258 * before rdtsc is enough to ensure that rdtsc is ordered
1259 * with respect to loads. The various CPU manuals are unclear
1260 * as to whether rdtsc can be reordered with later loads,
1261 * but no one has ever seen it happen.
1262 */
1263 rdtsc_barrier();
1264 ret = (cycle_t)vget_cycles();
1265
1266 last = pvclock_gtod_data.clock.cycle_last;
1267
1268 if (likely(ret >= last))
1269 return ret;
1270
1271 /*
1272 * GCC likes to generate cmov here, but this branch is extremely
 1273 * predictable (it's just a function of time and the likely is
1274 * very likely) and there's a data dependence, so force GCC
1275 * to generate a branch instead. I don't barrier() because
1276 * we don't actually need a barrier, and if this function
1277 * ever gets inlined it will generate worse code.
1278 */
1279 asm volatile ("");
1280 return last;
1281}
1282
1283static inline u64 vgettsc(cycle_t *cycle_now)
1284{
1285 long v;
1286 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1287
1288 *cycle_now = read_tsc();
1289
1290 v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1291 return v * gtod->clock.mult;
1292}
1293
1294static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
1295{
1296 unsigned long seq;
1297 u64 ns;
1298 int mode;
1299 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1300
1301 ts->tv_nsec = 0;
1302 do {
1303 seq = read_seqcount_begin(&gtod->seq);
1304 mode = gtod->clock.vclock_mode;
1305 ts->tv_sec = gtod->monotonic_time_sec;
1306 ns = gtod->monotonic_time_snsec;
1307 ns += vgettsc(cycle_now);
1308 ns >>= gtod->clock.shift;
1309 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1310 timespec_add_ns(ts, ns);
1311
1312 return mode;
1313}
1314
1315/* returns true if host is using tsc clocksource */
1316static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1317{
1318 struct timespec ts;
1319
1320 /* checked again under seqlock below */
1321 if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1322 return false;
1323
1324 if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
1325 return false;
1326
1327 monotonic_to_bootbased(&ts);
1328 *kernel_ns = timespec_to_ns(&ts);
1329
1330 return true;
1331}
1332#endif
1333
1334/*
1335 *
 1336 * Assuming a stable TSC across physical CPUs, and a stable TSC
1337 * across virtual CPUs, the following condition is possible.
1338 * Each numbered line represents an event visible to both
1339 * CPUs at the next numbered event.
1340 *
1341 * "timespecX" represents host monotonic time. "tscX" represents
1342 * RDTSC value.
1343 *
1344 * VCPU0 on CPU0 | VCPU1 on CPU1
1345 *
1346 * 1. read timespec0,tsc0
1347 * 2. | timespec1 = timespec0 + N
1348 * | tsc1 = tsc0 + M
1349 * 3. transition to guest | transition to guest
1350 * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1351 * 5. | ret1 = timespec1 + (rdtsc - tsc1)
1352 * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1353 *
1354 * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1355 *
1356 * - ret0 < ret1
1357 * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1358 * ...
1359 * - 0 < N - M => M < N
1360 *
1361 * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1362 * always the case (the difference between two distinct xtime instances
 1363 * might be smaller than the difference between corresponding TSC reads,
 1364 * when updating guest vcpus' pvclock areas).
1365 *
1366 * To avoid that problem, do not allow visibility of distinct
1367 * system_timestamp/tsc_timestamp values simultaneously: use a master
1368 * copy of host monotonic time values. Update that master copy
1369 * in lockstep.
1370 *
1371 * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1372 *
1373 */
1374
1375static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1376{
1377#ifdef CONFIG_X86_64
1378 struct kvm_arch *ka = &kvm->arch;
1379 int vclock_mode;
1380 bool host_tsc_clocksource, vcpus_matched;
1381
1382 vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1383 atomic_read(&kvm->online_vcpus));
1384
1385 /*
 1386 * If the host uses the TSC clock, then pass the TSC through as stable
1387 * to the guest.
1388 */
1389 host_tsc_clocksource = kvm_get_time_and_clockread(
1390 &ka->master_kernel_ns,
1391 &ka->master_cycle_now);
1392
1393 ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
1394
1395 if (ka->use_master_clock)
1396 atomic_set(&kvm_guest_has_master_clock, 1);
1397
1398 vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1399 trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1400 vcpus_matched);
1401#endif
1402}
1403
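
The block comment above pvclock_update_vm_gtod_copy() reduces cross-vCPU monotonicity to the condition M < N. Plugging made-up numbers into the same ret0/ret1 expressions shows what goes wrong when the TSC delta M outruns the monotonic-time delta N — exactly the case the master copy is meant to exclude (all values below are invented, with "ns" and "cycles" treated as directly comparable for illustration):

#include <stdio.h>

int main(void)
{
        long long timespec0 = 1000, tsc0 = 5000, rdtsc = 5400;
        long long N = 50;   /* monotonic time advanced between samples */
        long long M = 80;   /* TSC advanced between samples (M > N!)   */

        long long ret0 = timespec0 + (rdtsc - tsc0);            /* VCPU0 */
        long long ret1 = timespec0 + N + (rdtsc - (tsc0 + M));  /* VCPU1 */

        printf("ret0=%lld ret1=%lld -> %s\n", ret0, ret1,
               ret1 >= ret0 ? "monotonic" : "clock went backwards");
        return 0;
}
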
1138static int kvm_guest_time_update(struct kvm_vcpu *v) 1404static int kvm_guest_time_update(struct kvm_vcpu *v)
1139{ 1405{
1140 unsigned long flags; 1406 unsigned long flags, this_tsc_khz;
1141 struct kvm_vcpu_arch *vcpu = &v->arch; 1407 struct kvm_vcpu_arch *vcpu = &v->arch;
1408 struct kvm_arch *ka = &v->kvm->arch;
1142 void *shared_kaddr; 1409 void *shared_kaddr;
1143 unsigned long this_tsc_khz;
1144 s64 kernel_ns, max_kernel_ns; 1410 s64 kernel_ns, max_kernel_ns;
1145 u64 tsc_timestamp; 1411 u64 tsc_timestamp, host_tsc;
1412 struct pvclock_vcpu_time_info *guest_hv_clock;
1146 u8 pvclock_flags; 1413 u8 pvclock_flags;
1414 bool use_master_clock;
1415
1416 kernel_ns = 0;
1417 host_tsc = 0;
1147 1418
1148 /* Keep irq disabled to prevent changes to the clock */ 1419 /* Keep irq disabled to prevent changes to the clock */
1149 local_irq_save(flags); 1420 local_irq_save(flags);
1150 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
1151 kernel_ns = get_kernel_ns();
1152 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1421 this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
1153 if (unlikely(this_tsc_khz == 0)) { 1422 if (unlikely(this_tsc_khz == 0)) {
1154 local_irq_restore(flags); 1423 local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1157 } 1426 }
1158 1427
1159 /* 1428 /*
 1429 * If the host uses the TSC clock, then pass the TSC through as stable
1430 * to the guest.
1431 */
1432 spin_lock(&ka->pvclock_gtod_sync_lock);
1433 use_master_clock = ka->use_master_clock;
1434 if (use_master_clock) {
1435 host_tsc = ka->master_cycle_now;
1436 kernel_ns = ka->master_kernel_ns;
1437 }
1438 spin_unlock(&ka->pvclock_gtod_sync_lock);
1439 if (!use_master_clock) {
1440 host_tsc = native_read_tsc();
1441 kernel_ns = get_kernel_ns();
1442 }
1443
1444 tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1445
1446 /*
1160 * We may have to catch up the TSC to match elapsed wall clock 1447 * We may have to catch up the TSC to match elapsed wall clock
1161 * time for two reasons, even if kvmclock is used. 1448 * time for two reasons, even if kvmclock is used.
1162 * 1) CPU could have been running below the maximum TSC rate 1449 * 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1217 vcpu->hw_tsc_khz = this_tsc_khz; 1504 vcpu->hw_tsc_khz = this_tsc_khz;
1218 } 1505 }
1219 1506
1220 if (max_kernel_ns > kernel_ns) 1507 /* with a master <monotonic time, tsc value> tuple,
1221 kernel_ns = max_kernel_ns; 1508 * pvclock clock reads always increase at the (scaled) rate
1222 1509 * of guest TSC - no need to deal with sampling errors.
1510 */
1511 if (!use_master_clock) {
1512 if (max_kernel_ns > kernel_ns)
1513 kernel_ns = max_kernel_ns;
1514 }
1223 /* With all the info we got, fill in the values */ 1515 /* With all the info we got, fill in the values */
1224 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1516 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1225 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1517 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1226 vcpu->last_kernel_ns = kernel_ns; 1518 vcpu->last_kernel_ns = kernel_ns;
1227 vcpu->last_guest_tsc = tsc_timestamp; 1519 vcpu->last_guest_tsc = tsc_timestamp;
1228 1520
1229 pvclock_flags = 0;
1230 if (vcpu->pvclock_set_guest_stopped_request) {
1231 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1232 vcpu->pvclock_set_guest_stopped_request = false;
1233 }
1234
1235 vcpu->hv_clock.flags = pvclock_flags;
1236
1237 /* 1521 /*
1238 * The interface expects us to write an even number signaling that the 1522 * The interface expects us to write an even number signaling that the
1239 * update is finished. Since the guest won't see the intermediate 1523 * update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1243 1527
1244 shared_kaddr = kmap_atomic(vcpu->time_page); 1528 shared_kaddr = kmap_atomic(vcpu->time_page);
1245 1529
1530 guest_hv_clock = shared_kaddr + vcpu->time_offset;
1531
1532 /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1533 pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
1534
1535 if (vcpu->pvclock_set_guest_stopped_request) {
1536 pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1537 vcpu->pvclock_set_guest_stopped_request = false;
1538 }
1539
1540 /* If the host uses TSC clocksource, then it is stable */
1541 if (use_master_clock)
1542 pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1543
1544 vcpu->hv_clock.flags = pvclock_flags;
1545
1246 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1546 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
1247 sizeof(vcpu->hv_clock)); 1547 sizeof(vcpu->hv_clock));
1248 1548
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
1572 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1872 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
1573} 1873}
1574 1874
1575int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1875int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1576{ 1876{
1577 bool pr = false; 1877 bool pr = false;
1878 u32 msr = msr_info->index;
1879 u64 data = msr_info->data;
1578 1880
1579 switch (msr) { 1881 switch (msr) {
1580 case MSR_EFER: 1882 case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1625 case MSR_IA32_TSCDEADLINE: 1927 case MSR_IA32_TSCDEADLINE:
1626 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1928 kvm_set_lapic_tscdeadline_msr(vcpu, data);
1627 break; 1929 break;
1930 case MSR_IA32_TSC_ADJUST:
1931 if (guest_cpuid_has_tsc_adjust(vcpu)) {
1932 if (!msr_info->host_initiated) {
1933 u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
1934 kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
1935 }
1936 vcpu->arch.ia32_tsc_adjust_msr = data;
1937 }
1938 break;
1628 case MSR_IA32_MISC_ENABLE: 1939 case MSR_IA32_MISC_ENABLE:
1629 vcpu->arch.ia32_misc_enable_msr = data; 1940 vcpu->arch.ia32_misc_enable_msr = data;
1630 break; 1941 break;
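
The MSR_IA32_TSC_ADJUST case above only nudges the TSC offset by the delta between the new value and the stored one, and only for guest-originated writes; host-initiated writes just record the value. A toy model of that bookkeeping — the two variables mirror the fields the hunk touches, nothing else here is the real implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int64_t ia32_tsc_adjust_msr;    /* vcpu->arch.ia32_tsc_adjust_msr */
static int64_t tsc_offset;             /* what adjust_tsc_offset() moves */

static void wrmsr_tsc_adjust(int64_t data, bool host_initiated)
{
        if (!host_initiated) {
                int64_t adj = data - ia32_tsc_adjust_msr;
                tsc_offset += adj;      /* kvm_x86_ops->adjust_tsc_offset() */
        }
        ia32_tsc_adjust_msr = data;
}

int main(void)
{
        wrmsr_tsc_adjust(100, false);   /* guest write: offset moves by 100 */
        wrmsr_tsc_adjust(100, true);    /* host restore: offset unchanged   */
        printf("adjust=%lld offset=%lld\n",
               (long long)ia32_tsc_adjust_msr, (long long)tsc_offset);
        return 0;
}
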
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1984 case MSR_IA32_TSCDEADLINE: 2295 case MSR_IA32_TSCDEADLINE:
1985 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2296 data = kvm_get_lapic_tscdeadline_msr(vcpu);
1986 break; 2297 break;
2298 case MSR_IA32_TSC_ADJUST:
2299 data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2300 break;
1987 case MSR_IA32_MISC_ENABLE: 2301 case MSR_IA32_MISC_ENABLE:
1988 data = vcpu->arch.ia32_misc_enable_msr; 2302 data = vcpu->arch.ia32_misc_enable_msr;
1989 break; 2303 break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2342 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2656 kvm_x86_ops->write_tsc_offset(vcpu, offset);
2343 vcpu->arch.tsc_catchup = 1; 2657 vcpu->arch.tsc_catchup = 1;
2344 } 2658 }
2345 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2659 /*
2660 * On a host with synchronized TSC, there is no need to update
2661 * kvmclock on vcpu->cpu migration
2662 */
2663 if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2664 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2346 if (vcpu->cpu != cpu) 2665 if (vcpu->cpu != cpu)
2347 kvm_migrate_timers(vcpu); 2666 kvm_migrate_timers(vcpu);
2348 vcpu->cpu = cpu; 2667 vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2691 if (!vcpu->arch.apic) 3010 if (!vcpu->arch.apic)
2692 goto out; 3011 goto out;
2693 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3012 u.lapic = memdup_user(argp, sizeof(*u.lapic));
2694 if (IS_ERR(u.lapic)) { 3013 if (IS_ERR(u.lapic))
2695 r = PTR_ERR(u.lapic); 3014 return PTR_ERR(u.lapic);
2696 goto out;
2697 }
2698 3015
2699 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3016 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
2700 if (r)
2701 goto out;
2702 r = 0;
2703 break; 3017 break;
2704 } 3018 }
2705 case KVM_INTERRUPT: { 3019 case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2709 if (copy_from_user(&irq, argp, sizeof irq)) 3023 if (copy_from_user(&irq, argp, sizeof irq))
2710 goto out; 3024 goto out;
2711 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3025 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2712 if (r)
2713 goto out;
2714 r = 0;
2715 break; 3026 break;
2716 } 3027 }
2717 case KVM_NMI: { 3028 case KVM_NMI: {
2718 r = kvm_vcpu_ioctl_nmi(vcpu); 3029 r = kvm_vcpu_ioctl_nmi(vcpu);
2719 if (r)
2720 goto out;
2721 r = 0;
2722 break; 3030 break;
2723 } 3031 }
2724 case KVM_SET_CPUID: { 3032 case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2729 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3037 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2730 goto out; 3038 goto out;
2731 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3039 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
2732 if (r)
2733 goto out;
2734 break; 3040 break;
2735 } 3041 }
2736 case KVM_SET_CPUID2: { 3042 case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2742 goto out; 3048 goto out;
2743 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3049 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
2744 cpuid_arg->entries); 3050 cpuid_arg->entries);
2745 if (r)
2746 goto out;
2747 break; 3051 break;
2748 } 3052 }
2749 case KVM_GET_CPUID2: { 3053 case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2875 } 3179 }
2876 case KVM_SET_XSAVE: { 3180 case KVM_SET_XSAVE: {
2877 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3181 u.xsave = memdup_user(argp, sizeof(*u.xsave));
2878 if (IS_ERR(u.xsave)) { 3182 if (IS_ERR(u.xsave))
2879 r = PTR_ERR(u.xsave); 3183 return PTR_ERR(u.xsave);
2880 goto out;
2881 }
2882 3184
2883 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3185 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
2884 break; 3186 break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2900 } 3202 }
2901 case KVM_SET_XCRS: { 3203 case KVM_SET_XCRS: {
2902 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3204 u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
2903 if (IS_ERR(u.xcrs)) { 3205 if (IS_ERR(u.xcrs))
2904 r = PTR_ERR(u.xcrs); 3206 return PTR_ERR(u.xcrs);
2905 goto out;
2906 }
2907 3207
2908 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3208 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2909 break; 3209 break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
2951 int ret; 3251 int ret;
2952 3252
2953 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3253 if (addr > (unsigned int)(-3 * PAGE_SIZE))
2954 return -1; 3254 return -EINVAL;
2955 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3255 ret = kvm_x86_ops->set_tss_addr(kvm, addr);
2956 return ret; 3256 return ret;
2957} 3257}
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3212 switch (ioctl) { 3512 switch (ioctl) {
3213 case KVM_SET_TSS_ADDR: 3513 case KVM_SET_TSS_ADDR:
3214 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3514 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3215 if (r < 0)
3216 goto out;
3217 break; 3515 break;
3218 case KVM_SET_IDENTITY_MAP_ADDR: { 3516 case KVM_SET_IDENTITY_MAP_ADDR: {
3219 u64 ident_addr; 3517 u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3222 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3520 if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3223 goto out; 3521 goto out;
3224 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3522 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3225 if (r < 0)
3226 goto out;
3227 break; 3523 break;
3228 } 3524 }
3229 case KVM_SET_NR_MMU_PAGES: 3525 case KVM_SET_NR_MMU_PAGES:
3230 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3526 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3231 if (r)
3232 goto out;
3233 break; 3527 break;
3234 case KVM_GET_NR_MMU_PAGES: 3528 case KVM_GET_NR_MMU_PAGES:
3235 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 3529 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3320 r = 0; 3614 r = 0;
3321 get_irqchip_out: 3615 get_irqchip_out:
3322 kfree(chip); 3616 kfree(chip);
3323 if (r)
3324 goto out;
3325 break; 3617 break;
3326 } 3618 }
3327 case KVM_SET_IRQCHIP: { 3619 case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3343 r = 0; 3635 r = 0;
3344 set_irqchip_out: 3636 set_irqchip_out:
3345 kfree(chip); 3637 kfree(chip);
3346 if (r)
3347 goto out;
3348 break; 3638 break;
3349 } 3639 }
3350 case KVM_GET_PIT: { 3640 case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3371 if (!kvm->arch.vpit) 3661 if (!kvm->arch.vpit)
3372 goto out; 3662 goto out;
3373 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3663 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3374 if (r)
3375 goto out;
3376 r = 0;
3377 break; 3664 break;
3378 } 3665 }
3379 case KVM_GET_PIT2: { 3666 case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3397 if (!kvm->arch.vpit) 3684 if (!kvm->arch.vpit)
3398 goto out; 3685 goto out;
3399 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3686 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
3400 if (r)
3401 goto out;
3402 r = 0;
3403 break; 3687 break;
3404 } 3688 }
3405 case KVM_REINJECT_CONTROL: { 3689 case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3408 if (copy_from_user(&control, argp, sizeof(control))) 3692 if (copy_from_user(&control, argp, sizeof(control)))
3409 goto out; 3693 goto out;
3410 r = kvm_vm_ioctl_reinject(kvm, &control); 3694 r = kvm_vm_ioctl_reinject(kvm, &control);
3411 if (r)
3412 goto out;
3413 r = 0;
3414 break; 3695 break;
3415 } 3696 }
3416 case KVM_XEN_HVM_CONFIG: { 3697 case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4273static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4554static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4274 u32 msr_index, u64 data) 4555 u32 msr_index, u64 data)
4275{ 4556{
4276 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4557 struct msr_data msr;
4558
4559 msr.data = data;
4560 msr.index = msr_index;
4561 msr.host_initiated = false;
4562 return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4277} 4563}
4278 4564
4279static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, 4565static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
4495 * instruction -> ... 4781 * instruction -> ...
4496 */ 4782 */
4497 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4783 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
4498 if (!is_error_pfn(pfn)) { 4784 if (!is_error_noslot_pfn(pfn)) {
4499 kvm_release_pfn_clean(pfn); 4785 kvm_release_pfn_clean(pfn);
4500 return true; 4786 return true;
4501 } 4787 }
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
4881 kvm_mmu_set_mmio_spte_mask(mask); 5167 kvm_mmu_set_mmio_spte_mask(mask);
4882} 5168}
4883 5169
5170#ifdef CONFIG_X86_64
5171static void pvclock_gtod_update_fn(struct work_struct *work)
5172{
5173 struct kvm *kvm;
5174
5175 struct kvm_vcpu *vcpu;
5176 int i;
5177
5178 raw_spin_lock(&kvm_lock);
5179 list_for_each_entry(kvm, &vm_list, vm_list)
5180 kvm_for_each_vcpu(i, vcpu, kvm)
5181 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5182 atomic_set(&kvm_guest_has_master_clock, 0);
5183 raw_spin_unlock(&kvm_lock);
5184}
5185
5186static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5187
5188/*
5189 * Notification about pvclock gtod data update.
5190 */
5191static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5192 void *priv)
5193{
5194 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5195 struct timekeeper *tk = priv;
5196
5197 update_pvclock_gtod(tk);
5198
5199 /* disable master clock if host does not trust, or does not
5200 * use, TSC clocksource
5201 */
5202 if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5203 atomic_read(&kvm_guest_has_master_clock) != 0)
5204 queue_work(system_long_wq, &pvclock_gtod_work);
5205
5206 return 0;
5207}
5208
5209static struct notifier_block pvclock_gtod_notifier = {
5210 .notifier_call = pvclock_gtod_notify,
5211};
5212#endif
5213
4884int kvm_arch_init(void *opaque) 5214int kvm_arch_init(void *opaque)
4885{ 5215{
4886 int r; 5216 int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
4922 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5252 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
4923 5253
4924 kvm_lapic_init(); 5254 kvm_lapic_init();
5255#ifdef CONFIG_X86_64
5256 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5257#endif
5258
4925 return 0; 5259 return 0;
4926 5260
4927out: 5261out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
4936 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5270 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4937 CPUFREQ_TRANSITION_NOTIFIER); 5271 CPUFREQ_TRANSITION_NOTIFIER);
4938 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 5272 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5273#ifdef CONFIG_X86_64
5274 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5275#endif
4939 kvm_x86_ops = NULL; 5276 kvm_x86_ops = NULL;
4940 kvm_mmu_module_exit(); 5277 kvm_mmu_module_exit();
4941} 5278}
@@ -5059,7 +5396,7 @@ out:
5059} 5396}
5060EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5397EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5061 5398
5062int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5399static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5063{ 5400{
5064 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5401 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5065 char instruction[3]; 5402 char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
5235 kvm_make_request(KVM_REQ_EVENT, vcpu); 5572 kvm_make_request(KVM_REQ_EVENT, vcpu);
5236} 5573}
5237 5574
5575static void kvm_gen_update_masterclock(struct kvm *kvm)
5576{
5577#ifdef CONFIG_X86_64
5578 int i;
5579 struct kvm_vcpu *vcpu;
5580 struct kvm_arch *ka = &kvm->arch;
5581
5582 spin_lock(&ka->pvclock_gtod_sync_lock);
5583 kvm_make_mclock_inprogress_request(kvm);
5584 /* no guest entries from this point */
5585 pvclock_update_vm_gtod_copy(kvm);
5586
5587 kvm_for_each_vcpu(i, vcpu, kvm)
5588 set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
5589
5590 /* guest entries allowed */
5591 kvm_for_each_vcpu(i, vcpu, kvm)
5592 clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
5593
5594 spin_unlock(&ka->pvclock_gtod_sync_lock);
5595#endif
5596}
5597
5238static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5598static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5239{ 5599{
5240 int r; 5600 int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5247 kvm_mmu_unload(vcpu); 5607 kvm_mmu_unload(vcpu);
5248 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5608 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
5249 __kvm_migrate_timers(vcpu); 5609 __kvm_migrate_timers(vcpu);
5610 if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
5611 kvm_gen_update_masterclock(vcpu->kvm);
5250 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5612 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
5251 r = kvm_guest_time_update(vcpu); 5613 r = kvm_guest_time_update(vcpu);
5252 if (unlikely(r)) 5614 if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+							   native_read_tsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			pr_debug("vcpu %d received sipi with vector # %x\n",
 				 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 			kvm_lapic_reset(vcpu);
-			r = kvm_arch_vcpu_reset(vcpu);
+			r = kvm_vcpu_reset(vcpu);
 			if (r)
 				return r;
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	r = kvm_arch_vcpu_reset(vcpu);
+	r = kvm_vcpu_reset(vcpu);
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	return r;
 }
 
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	int r;
+	struct msr_data msr;
+
+	r = vcpu_load(vcpu);
+	if (r)
+		return r;
+	msr.data = 0x0;
+	msr.index = MSR_IA32_TSC;
+	msr.host_initiated = true;
+	kvm_write_tsc(vcpu, &msr);
+	vcpu_put(vcpu);
+
+	return r;
+}
+
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_pmu_reset(vcpu);
 
+	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+	vcpu->arch.regs_avail = ~0;
+	vcpu->arch.regs_dirty = ~0;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.tsc_offset_adjustment += delta_cyc;
 			vcpu->arch.last_host_tsc = local_tsc;
+			set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+				&vcpu->requests);
 		}
 
 		/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
 		goto fail_free_mce_banks;
 
+	r = fx_init(vcpu);
+	if (r)
+		goto fail_free_wbinvd_dirty_mask;
+
+	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
+fail_free_wbinvd_dirty_mask:
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
 	mutex_init(&kvm->arch.apic_map_lock);
+	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+	pvclock_update_vm_gtod_copy(kvm);
 
 	return 0;
 }
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac8..e224f7a671b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421a..205ad328aa52 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
 #include <asm/hpet.h>
 #include <asm/unistd.h>
 #include <asm/io.h>
+#include <asm/pvclock.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
 	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
 }
 
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+	const struct pvclock_vsyscall_time_info *pvti_base;
+	int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+	int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+	BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+	pvti_base = (struct pvclock_vsyscall_time_info *)
+		    __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+	return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+	const struct pvclock_vsyscall_time_info *pvti;
+	cycle_t ret;
+	u64 last;
+	u32 version;
+	u32 migrate_count;
+	u8 flags;
+	unsigned cpu, cpu1;
+
+
+	/*
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
+	 */
+	do {
+		cpu = __getcpu() & VGETCPU_CPU_MASK;
+		/* TODO: We can put vcpu id into higher bits of pvti.version.
+		 * This will save a couple of cycles by getting rid of
+		 * __getcpu() calls (Gleb).
+		 */
+
+		pvti = get_pvti(cpu);
+
+		migrate_count = pvti->migrate_count;
+
+		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+		/*
+		 * Test we're still on the cpu as well as the version.
+		 * We could have been migrated just after the first
+		 * vgetcpu but before fetching the version, so we
+		 * wouldn't notice a version change.
+		 */
+		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+	} while (unlikely(cpu != cpu1 ||
+			  (pvti->pvti.version & 1) ||
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
+
+	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+		*mode = VCLOCK_NONE;
+
+	/* refer to tsc.c read_tsc() comment for rationale */
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	return last;
+}
+#endif
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
 }
 
 
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
 {
 	long v;
 	cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
 		cycles = vread_tsc();
 	else if (gtod->clock.vclock_mode == VCLOCK_HPET)
 		cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+	else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+		cycles = vread_pvclock(mode);
+#endif
 	else
 		return 0;
 	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->wall_time_sec;
 		ns = gtod->wall_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
 		mode = gtod->clock.vclock_mode;
 		ts->tv_sec = gtod->monotonic_time_sec;
 		ns = gtod->monotonic_time_snsec;
-		ns += vgetsns();
+		ns += vgetsns(&mode);
 		ns >>= gtod->clock.shift;
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	timespec_add_ns(ts, ns);
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad558573..2f94b039e55b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
 	unsigned int p;
 
-	if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-	}
+	p = __getcpu();
+
 	if (cpu)
-		*cpu = p & 0xfff;
+		*cpu = p & VGETCPU_CPU_MASK;
 	if (node)
 		*node = p >> 12;
 	return 0;