Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h      10
-rw-r--r--  arch/x86/include/asm/kvm_host.h         23
-rw-r--r--  arch/x86/include/asm/pvclock.h           2
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h          6
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h    1
-rw-r--r--  arch/x86/kernel/kvmclock.c               1
-rw-r--r--  arch/x86/kernel/pvclock.c               13
-rw-r--r--  arch/x86/kvm/Kconfig                     1
-rw-r--r--  arch/x86/kvm/Makefile                    2
-rw-r--r--  arch/x86/kvm/cpuid.c                   115
-rw-r--r--  arch/x86/kvm/cpuid.h                     5
-rw-r--r--  arch/x86/kvm/emulate.c                 130
-rw-r--r--  arch/x86/kvm/mmu.c                     115
-rw-r--r--  arch/x86/kvm/mmu.h                       4
-rw-r--r--  arch/x86/kvm/svm.c                       8
-rw-r--r--  arch/x86/kvm/vmx.c                     158
-rw-r--r--  arch/x86/kvm/x86.c                     108
-rw-r--r--  arch/x86/kvm/x86.h                       1
18 files changed, 489 insertions(+), 214 deletions(-)
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 15f960c06ff7..24ec1216596e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -274,13 +274,17 @@ struct x86_emulate_ctxt {
274 274
275 bool guest_mode; /* guest running a nested guest */ 275 bool guest_mode; /* guest running a nested guest */
276 bool perm_ok; /* do not check permissions if true */ 276 bool perm_ok; /* do not check permissions if true */
277 bool only_vendor_specific_insn; 277 bool ud; /* inject an #UD if host doesn't support insn */
278 278
279 bool have_exception; 279 bool have_exception;
280 struct x86_exception exception; 280 struct x86_exception exception;
281 281
282 /* decode cache */ 282 /*
283 u8 twobyte; 283 * decode cache
284 */
285
286 /* current opcode length in bytes */
287 u8 opcode_len;
284 u8 b; 288 u8 b;
285 u8 intercept; 289 u8 intercept;
286 u8 lock_prefix; 290 u8 lock_prefix;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c76ff74a98f2..ae5d7830855c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,6 +79,13 @@
79#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 79#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1))
80#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) 80#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
81 81
82static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
83{
84 /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
85 return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
86 (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
87}
88
82#define SELECTOR_TI_MASK (1 << 2) 89#define SELECTOR_TI_MASK (1 << 2)
83#define SELECTOR_RPL_MASK 0x03 90#define SELECTOR_RPL_MASK 0x03
84 91
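
[Illustration] The new gfn_to_index() helper converts a guest frame number into a slot-relative index at a given page-table level. A minimal userspace sketch of the same arithmetic is below; the constants mirror KVM's x86 defaults (4 KiB base pages, 9 bits per level) and all demo_* names are assumptions made for this example, not KVM symbols.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

#define DEMO_PT_PAGE_TABLE_LEVEL 1
#define DEMO_HPAGE_GFN_SHIFT(lvl) (((lvl) - DEMO_PT_PAGE_TABLE_LEVEL) * 9)

static gfn_t demo_gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
{
	/* For level 1 the shift is 0, so the result is just gfn - base_gfn. */
	return (gfn >> DEMO_HPAGE_GFN_SHIFT(level)) -
	       (base_gfn >> DEMO_HPAGE_GFN_SHIFT(level));
}

int main(void)
{
	/* gfn 0x12345 inside a memory slot starting at gfn 0x10000 */
	printf("4K index: %llu\n",
	       (unsigned long long)demo_gfn_to_index(0x12345, 0x10000, 1));
	printf("2M index: %llu\n",
	       (unsigned long long)demo_gfn_to_index(0x12345, 0x10000, 2));
	printf("1G index: %llu\n",
	       (unsigned long long)demo_gfn_to_index(0x12345, 0x10000, 3));
	return 0;
}
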
@@ -253,7 +260,6 @@ struct kvm_pio_request {
253 * mode. 260 * mode.
254 */ 261 */
255struct kvm_mmu { 262struct kvm_mmu {
256 void (*new_cr3)(struct kvm_vcpu *vcpu);
257 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); 263 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
258 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); 264 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
259 u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); 265 u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
@@ -261,7 +267,6 @@ struct kvm_mmu {
261 bool prefault); 267 bool prefault);
262 void (*inject_page_fault)(struct kvm_vcpu *vcpu, 268 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
263 struct x86_exception *fault); 269 struct x86_exception *fault);
264 void (*free)(struct kvm_vcpu *vcpu);
265 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 270 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
266 struct x86_exception *exception); 271 struct x86_exception *exception);
267 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 272 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
@@ -389,6 +394,8 @@ struct kvm_vcpu_arch {
389 394
390 struct fpu guest_fpu; 395 struct fpu guest_fpu;
391 u64 xcr0; 396 u64 xcr0;
397 u64 guest_supported_xcr0;
398 u32 guest_xstate_size;
392 399
393 struct kvm_pio_request pio; 400 struct kvm_pio_request pio;
394 void *pio_data; 401 void *pio_data;
@@ -557,7 +564,9 @@ struct kvm_arch {
557 564
558 struct list_head assigned_dev_head; 565 struct list_head assigned_dev_head;
559 struct iommu_domain *iommu_domain; 566 struct iommu_domain *iommu_domain;
560 int iommu_flags; 567 bool iommu_noncoherent;
568#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
569 atomic_t noncoherent_dma_count;
561 struct kvm_pic *vpic; 570 struct kvm_pic *vpic;
562 struct kvm_ioapic *vioapic; 571 struct kvm_ioapic *vioapic;
563 struct kvm_pit *vpit; 572 struct kvm_pit *vpit;
@@ -780,11 +789,11 @@ void kvm_mmu_module_exit(void);
780 789
781void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 790void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
782int kvm_mmu_create(struct kvm_vcpu *vcpu); 791int kvm_mmu_create(struct kvm_vcpu *vcpu);
783int kvm_mmu_setup(struct kvm_vcpu *vcpu); 792void kvm_mmu_setup(struct kvm_vcpu *vcpu);
784void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 793void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
785 u64 dirty_mask, u64 nx_mask, u64 x_mask); 794 u64 dirty_mask, u64 nx_mask, u64 x_mask);
786 795
787int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 796void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
788void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 797void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
789void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 798void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
790 struct kvm_memory_slot *slot, 799 struct kvm_memory_slot *slot,
@@ -922,13 +931,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
922int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, 931int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
923 void *insn, int insn_len); 932 void *insn, int insn_len);
924void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 933void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
934void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
925 935
926void kvm_enable_tdp(void); 936void kvm_enable_tdp(void);
927void kvm_disable_tdp(void); 937void kvm_disable_tdp(void);
928 938
929int complete_pio(struct kvm_vcpu *vcpu);
930bool kvm_check_iopl(struct kvm_vcpu *vcpu);
931
932static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) 939static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
933{ 940{
934 return gpa; 941 return gpa;
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index be8269b00e2a..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -14,6 +14,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
14 struct timespec *ts); 14 struct timespec *ts);
15void pvclock_resume(void); 15void pvclock_resume(void);
16 16
17void pvclock_touch_watchdogs(void);
18
17/* 19/*
18 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, 20 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
19 * yielding a 64-bit result. 21 * yielding a 64-bit result.
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5d9a3033b3d7..d3a87780c70b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 {
211 __u32 padding[3]; 211 __u32 padding[3];
212}; 212};
213 213
214#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 214#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX BIT(0)
215#define KVM_CPUID_FLAG_STATEFUL_FUNC 2 215#define KVM_CPUID_FLAG_STATEFUL_FUNC BIT(1)
216#define KVM_CPUID_FLAG_STATE_READ_NEXT 4 216#define KVM_CPUID_FLAG_STATE_READ_NEXT BIT(2)
217 217
218/* for KVM_SET_CPUID2 */ 218/* for KVM_SET_CPUID2 */
219struct kvm_cpuid2 { 219struct kvm_cpuid2 {
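
[Illustration] The flag macros above are now expressed with BIT() but keep their values. A minimal sketch of how a consumer tests them on returned entries follows; the struct is a hand-copied mirror of kvm_cpuid_entry2 for illustration only (real code should include <linux/kvm.h>), and demo_* names are assumptions.

#include <stdint.h>
#include <stdio.h>

#define DEMO_FLAG_SIGNIFCANT_INDEX (1u << 0)
#define DEMO_FLAG_STATEFUL_FUNC    (1u << 1)
#define DEMO_FLAG_STATE_READ_NEXT  (1u << 2)

struct demo_cpuid_entry {
	uint32_t function, index, flags;
	uint32_t eax, ebx, ecx, edx;
	uint32_t padding[3];
};

static void demo_print_entry(const struct demo_cpuid_entry *e)
{
	if (e->flags & DEMO_FLAG_SIGNIFCANT_INDEX)
		printf("leaf %#x subleaf %#x\n", e->function, e->index);
	else
		printf("leaf %#x (index ignored)\n", e->function);
}

int main(void)
{
	struct demo_cpuid_entry xsave = {
		.function = 0xd, .index = 1,
		.flags = DEMO_FLAG_SIGNIFCANT_INDEX,
	};
	demo_print_entry(&xsave);
	return 0;
}
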
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index bb0465090ae5..b93e09a0fa21 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -536,6 +536,7 @@
536 536
537/* MSR_IA32_VMX_MISC bits */ 537/* MSR_IA32_VMX_MISC bits */
538#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29) 538#define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
539#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE 0x1F
539/* AMD-V MSRs */ 540/* AMD-V MSRs */
540 541
541#define MSR_VM_CR 0xc0010114 542#define MSR_VM_CR 0xc0010114
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1570e0741344..e6041094ff26 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void)
139 src = &hv_clock[cpu].pvti; 139 src = &hv_clock[cpu].pvti;
140 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { 140 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
141 src->flags &= ~PVCLOCK_GUEST_STOPPED; 141 src->flags &= ~PVCLOCK_GUEST_STOPPED;
142 pvclock_touch_watchdogs();
142 ret = true; 143 ret = true;
143 } 144 }
144 145
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index a16bae3f83b3..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
43 return pv_tsc_khz; 43 return pv_tsc_khz;
44} 44}
45 45
46void pvclock_touch_watchdogs(void)
47{
48 touch_softlockup_watchdog_sync();
49 clocksource_touch_watchdog();
50 rcu_cpu_stall_reset();
51 reset_hung_task_detector();
52}
53
46static atomic64_t last_value = ATOMIC64_INIT(0); 54static atomic64_t last_value = ATOMIC64_INIT(0);
47 55
48void pvclock_resume(void) 56void pvclock_resume(void)
@@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
74 version = __pvclock_read_cycles(src, &ret, &flags); 82 version = __pvclock_read_cycles(src, &ret, &flags);
75 } while ((src->version & 1) || version != src->version); 83 } while ((src->version & 1) || version != src->version);
76 84
85 if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
86 src->flags &= ~PVCLOCK_GUEST_STOPPED;
87 pvclock_touch_watchdogs();
88 }
89
77 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 90 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
78 (flags & PVCLOCK_TSC_STABLE_BIT)) 91 (flags & PVCLOCK_TSC_STABLE_BIT))
79 return ret; 92 return ret;
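
[Illustration] The hunk above makes the common pvclock read path clear PVCLOCK_GUEST_STOPPED and poke the watchdogs, mirroring what kvmclock already did. A hedged userspace sketch of the seqcount-style read protocol with that flag handling is below; the struct layout, demo_* names, and constants are illustrative, not the kernel's definitions, and the fences use GCC/Clang builtins.

#include <stdint.h>

#define DEMO_PVCLOCK_TSC_STABLE_BIT (1 << 0)
#define DEMO_PVCLOCK_GUEST_STOPPED  (1 << 1)

struct demo_vcpu_time_info {
	uint32_t version;
	uint64_t system_time;
	uint8_t  flags;
};

static void demo_touch_watchdogs(void)
{
	/* stand-in for pvclock_touch_watchdogs() */
}

static uint64_t demo_pvclock_read(volatile struct demo_vcpu_time_info *src)
{
	uint32_t version;
	uint64_t ret;
	uint8_t flags;

	do {
		version = src->version;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
		ret = src->system_time;   /* plus a scaled TSC delta in real code */
		flags = src->flags;
		__atomic_thread_fence(__ATOMIC_ACQUIRE);
	} while ((version & 1) || version != src->version);

	if (flags & DEMO_PVCLOCK_GUEST_STOPPED) {
		src->flags &= ~DEMO_PVCLOCK_GUEST_STOPPED;
		demo_touch_watchdogs();
	}
	return ret;
}

int main(void)
{
	struct demo_vcpu_time_info ti = {
		.version = 2, .system_time = 1000,
		.flags = DEMO_PVCLOCK_GUEST_STOPPED,
	};
	return demo_pvclock_read(&ti) == 1000 ? 0 : 1;
}
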
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a47a3e54b964..b89c5db2b832 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -38,6 +38,7 @@ config KVM
38 select PERF_EVENTS 38 select PERF_EVENTS
39 select HAVE_KVM_MSI 39 select HAVE_KVM_MSI
40 select HAVE_KVM_CPU_RELAX_INTERCEPT 40 select HAVE_KVM_CPU_RELAX_INTERCEPT
41 select KVM_VFIO
41 ---help--- 42 ---help---
42 Support hosting fully virtualized guest machines using hardware 43 Support hosting fully virtualized guest machines using hardware
43 virtualization extensions. You will need a fairly recent 44 virtualization extensions. You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index bf4fb04d0112..25d22b2d6509 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,7 @@ KVM := ../../../virt/kvm
9 9
10kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \ 10kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
11 $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \ 11 $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
12 $(KVM)/eventfd.o $(KVM)/irqchip.o 12 $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
13kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o 13kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
14kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o 14kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
15 15
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b110fe6c03d4..c6976257eff5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -23,6 +23,26 @@
23#include "mmu.h" 23#include "mmu.h"
24#include "trace.h" 24#include "trace.h"
25 25
26static u32 xstate_required_size(u64 xstate_bv)
27{
28 int feature_bit = 0;
29 u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
30
31 xstate_bv &= ~XSTATE_FPSSE;
32 while (xstate_bv) {
33 if (xstate_bv & 0x1) {
34 u32 eax, ebx, ecx, edx;
35 cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
36 ret = max(ret, eax + ebx);
37 }
38
39 xstate_bv >>= 1;
40 feature_bit++;
41 }
42
43 return ret;
44}
45
26void kvm_update_cpuid(struct kvm_vcpu *vcpu) 46void kvm_update_cpuid(struct kvm_vcpu *vcpu)
27{ 47{
28 struct kvm_cpuid_entry2 *best; 48 struct kvm_cpuid_entry2 *best;
@@ -46,6 +66,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
46 apic->lapic_timer.timer_mode_mask = 1 << 17; 66 apic->lapic_timer.timer_mode_mask = 1 << 17;
47 } 67 }
48 68
69 best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
70 if (!best) {
71 vcpu->arch.guest_supported_xcr0 = 0;
72 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
73 } else {
74 vcpu->arch.guest_supported_xcr0 =
75 (best->eax | ((u64)best->edx << 32)) &
76 host_xcr0 & KVM_SUPPORTED_XCR0;
77 vcpu->arch.guest_xstate_size =
78 xstate_required_size(vcpu->arch.guest_supported_xcr0);
79 }
80
49 kvm_pmu_cpuid_update(vcpu); 81 kvm_pmu_cpuid_update(vcpu);
50} 82}
51 83
@@ -182,13 +214,35 @@ static bool supported_xcr0_bit(unsigned bit)
182{ 214{
183 u64 mask = ((u64)1 << bit); 215 u64 mask = ((u64)1 << bit);
184 216
185 return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; 217 return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
186} 218}
187 219
188#define F(x) bit(X86_FEATURE_##x) 220#define F(x) bit(X86_FEATURE_##x)
189 221
190static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 222static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
191 u32 index, int *nent, int maxnent) 223 u32 func, u32 index, int *nent, int maxnent)
224{
225 switch (func) {
226 case 0:
227 entry->eax = 1; /* only one leaf currently */
228 ++*nent;
229 break;
230 case 1:
231 entry->ecx = F(MOVBE);
232 ++*nent;
233 break;
234 default:
235 break;
236 }
237
238 entry->function = func;
239 entry->index = index;
240
241 return 0;
242}
243
244static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
245 u32 index, int *nent, int maxnent)
192{ 246{
193 int r; 247 int r;
194 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 248 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -383,6 +437,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
383 case 0xd: { 437 case 0xd: {
384 int idx, i; 438 int idx, i;
385 439
440 entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
441 entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
386 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 442 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
387 for (idx = 1, i = 1; idx < 64; ++idx) { 443 for (idx = 1, i = 1; idx < 64; ++idx) {
388 if (*nent >= maxnent) 444 if (*nent >= maxnent)
@@ -481,6 +537,15 @@ out:
481 return r; 537 return r;
482} 538}
483 539
540static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
541 u32 idx, int *nent, int maxnent, unsigned int type)
542{
543 if (type == KVM_GET_EMULATED_CPUID)
544 return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
545
546 return __do_cpuid_ent(entry, func, idx, nent, maxnent);
547}
548
484#undef F 549#undef F
485 550
486struct kvm_cpuid_param { 551struct kvm_cpuid_param {
@@ -495,8 +560,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
495 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; 560 return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
496} 561}
497 562
498int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 563static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
499 struct kvm_cpuid_entry2 __user *entries) 564 __u32 num_entries, unsigned int ioctl_type)
565{
566 int i;
567 __u32 pad[3];
568
569 if (ioctl_type != KVM_GET_EMULATED_CPUID)
570 return false;
571
572 /*
573 * We want to make sure that ->padding is being passed clean from
574 * userspace in case we want to use it for something in the future.
575 *
576 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
577 * have to give ourselves satisfied only with the emulated side. /me
578 * sheds a tear.
579 */
580 for (i = 0; i < num_entries; i++) {
581 if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
582 return true;
583
584 if (pad[0] || pad[1] || pad[2])
585 return true;
586 }
587 return false;
588}
589
590int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
591 struct kvm_cpuid_entry2 __user *entries,
592 unsigned int type)
500{ 593{
501 struct kvm_cpuid_entry2 *cpuid_entries; 594 struct kvm_cpuid_entry2 *cpuid_entries;
502 int limit, nent = 0, r = -E2BIG, i; 595 int limit, nent = 0, r = -E2BIG, i;
@@ -513,8 +606,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
513 goto out; 606 goto out;
514 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) 607 if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
515 cpuid->nent = KVM_MAX_CPUID_ENTRIES; 608 cpuid->nent = KVM_MAX_CPUID_ENTRIES;
609
610 if (sanity_check_entries(entries, cpuid->nent, type))
611 return -EINVAL;
612
516 r = -ENOMEM; 613 r = -ENOMEM;
517 cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent); 614 cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
518 if (!cpuid_entries) 615 if (!cpuid_entries)
519 goto out; 616 goto out;
520 617
@@ -526,7 +623,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
526 continue; 623 continue;
527 624
528 r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx, 625 r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
529 &nent, cpuid->nent); 626 &nent, cpuid->nent, type);
530 627
531 if (r) 628 if (r)
532 goto out_free; 629 goto out_free;
@@ -537,7 +634,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
537 limit = cpuid_entries[nent - 1].eax; 634 limit = cpuid_entries[nent - 1].eax;
538 for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) 635 for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
539 r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx, 636 r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
540 &nent, cpuid->nent); 637 &nent, cpuid->nent, type);
541 638
542 if (r) 639 if (r)
543 goto out_free; 640 goto out_free;
@@ -661,6 +758,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
661 *edx = best->edx; 758 *edx = best->edx;
662 } else 759 } else
663 *eax = *ebx = *ecx = *edx = 0; 760 *eax = *ebx = *ecx = *edx = 0;
761 trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
664} 762}
665EXPORT_SYMBOL_GPL(kvm_cpuid); 763EXPORT_SYMBOL_GPL(kvm_cpuid);
666 764
@@ -676,6 +774,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
676 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); 774 kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
677 kvm_register_write(vcpu, VCPU_REGS_RDX, edx); 775 kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
678 kvm_x86_ops->skip_emulated_instruction(vcpu); 776 kvm_x86_ops->skip_emulated_instruction(vcpu);
679 trace_kvm_cpuid(function, eax, ebx, ecx, edx);
680} 777}
681EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 778EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b7fd07984888..f1e4895174b2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -6,8 +6,9 @@
6void kvm_update_cpuid(struct kvm_vcpu *vcpu); 6void kvm_update_cpuid(struct kvm_vcpu *vcpu);
7struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 7struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
8 u32 function, u32 index); 8 u32 function, u32 index);
9int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, 9int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
10 struct kvm_cpuid_entry2 __user *entries); 10 struct kvm_cpuid_entry2 __user *entries,
11 unsigned int type);
11int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, 12int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
12 struct kvm_cpuid *cpuid, 13 struct kvm_cpuid *cpuid,
13 struct kvm_cpuid_entry __user *entries); 14 struct kvm_cpuid_entry __user *entries);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ddc3f3d2afdb..07ffca0a89e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -130,7 +130,7 @@
130#define Mov (1<<20) 130#define Mov (1<<20)
131/* Misc flags */ 131/* Misc flags */
132#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ 132#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
133#define VendorSpecific (1<<22) /* Vendor specific instruction */ 133#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
134#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 134#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
135#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 135#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
136#define Undefined (1<<25) /* No Such Instruction */ 136#define Undefined (1<<25) /* No Such Instruction */
@@ -785,9 +785,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
785 * @highbyte_regs specifies whether to decode AH,CH,DH,BH. 785 * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
786 */ 786 */
787static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg, 787static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
788 int highbyte_regs) 788 int byteop)
789{ 789{
790 void *p; 790 void *p;
791 int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
791 792
792 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) 793 if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
793 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1; 794 p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
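
[Illustration] The decode_register() change moves the high-byte-register decision into the helper: for byte-sized operands, register encodings 4-7 mean AH/CH/DH/BH only when the instruction carries no REX prefix; with any REX prefix they select SPL/BPL/SIL/DIL. A sketch of that rule is below; the tables and demo_* names are assumptions for illustration.

#include <stdbool.h>
#include <stdio.h>

static const char *demo_reg8_norex[8] = {
	"al", "cl", "dl", "bl", "ah", "ch", "dh", "bh"
};
static const char *demo_reg8_rex[8] = {
	"al", "cl", "dl", "bl", "spl", "bpl", "sil", "dil"
};

static const char *demo_decode_byte_reg(unsigned reg, bool has_rex)
{
	return has_rex ? demo_reg8_rex[reg & 7] : demo_reg8_norex[reg & 7];
}

int main(void)
{
	printf("reg 4, no REX: %s\n", demo_decode_byte_reg(4, false)); /* ah  */
	printf("reg 4, REX:    %s\n", demo_decode_byte_reg(4, true));  /* spl */
	return 0;
}
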
@@ -1024,7 +1025,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
1024 struct operand *op) 1025 struct operand *op)
1025{ 1026{
1026 unsigned reg = ctxt->modrm_reg; 1027 unsigned reg = ctxt->modrm_reg;
1027 int highbyte_regs = ctxt->rex_prefix == 0;
1028 1028
1029 if (!(ctxt->d & ModRM)) 1029 if (!(ctxt->d & ModRM))
1030 reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3); 1030 reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
@@ -1045,13 +1045,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
1045 } 1045 }
1046 1046
1047 op->type = OP_REG; 1047 op->type = OP_REG;
1048 if (ctxt->d & ByteOp) { 1048 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
1049 op->addr.reg = decode_register(ctxt, reg, highbyte_regs); 1049 op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
1050 op->bytes = 1; 1050
1051 } else {
1052 op->addr.reg = decode_register(ctxt, reg, 0);
1053 op->bytes = ctxt->op_bytes;
1054 }
1055 fetch_register_operand(op); 1051 fetch_register_operand(op);
1056 op->orig_val = op->val; 1052 op->orig_val = op->val;
1057} 1053}
@@ -1082,12 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
1082 ctxt->modrm_seg = VCPU_SREG_DS; 1078 ctxt->modrm_seg = VCPU_SREG_DS;
1083 1079
1084 if (ctxt->modrm_mod == 3) { 1080 if (ctxt->modrm_mod == 3) {
1085 int highbyte_regs = ctxt->rex_prefix == 0;
1086
1087 op->type = OP_REG; 1081 op->type = OP_REG;
1088 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; 1082 op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
1089 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1083 op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
1090 highbyte_regs && (ctxt->d & ByteOp)); 1084 ctxt->d & ByteOp);
1091 if (ctxt->d & Sse) { 1085 if (ctxt->d & Sse) {
1092 op->type = OP_XMM; 1086 op->type = OP_XMM;
1093 op->bytes = 16; 1087 op->bytes = 16;
@@ -2961,6 +2955,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
2961 return X86EMUL_CONTINUE; 2955 return X86EMUL_CONTINUE;
2962} 2956}
2963 2957
2958#define FFL(x) bit(X86_FEATURE_##x)
2959
2960static int em_movbe(struct x86_emulate_ctxt *ctxt)
2961{
2962 u32 ebx, ecx, edx, eax = 1;
2963 u16 tmp;
2964
2965 /*
2966 * Check MOVBE is set in the guest-visible CPUID leaf.
2967 */
2968 ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
2969 if (!(ecx & FFL(MOVBE)))
2970 return emulate_ud(ctxt);
2971
2972 switch (ctxt->op_bytes) {
2973 case 2:
2974 /*
2975 * From MOVBE definition: "...When the operand size is 16 bits,
2976 * the upper word of the destination register remains unchanged
2977 * ..."
2978 *
2979 * Both casting ->valptr and ->val to u16 breaks strict aliasing
2980 * rules so we have to do the operation almost per hand.
2981 */
2982 tmp = (u16)ctxt->src.val;
2983 ctxt->dst.val &= ~0xffffUL;
2984 ctxt->dst.val |= (unsigned long)swab16(tmp);
2985 break;
2986 case 4:
2987 ctxt->dst.val = swab32((u32)ctxt->src.val);
2988 break;
2989 case 8:
2990 ctxt->dst.val = swab64(ctxt->src.val);
2991 break;
2992 default:
2993 return X86EMUL_PROPAGATE_FAULT;
2994 }
2995 return X86EMUL_CONTINUE;
2996}
2997
2964static int em_cr_write(struct x86_emulate_ctxt *ctxt) 2998static int em_cr_write(struct x86_emulate_ctxt *ctxt)
2965{ 2999{
2966 if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) 3000 if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
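
[Illustration] em_movbe() byte-swaps between register and memory, and for the 16-bit form leaves the upper bits of the destination untouched, as the comment in the hunk notes. A standalone sketch of that operand-size handling is below; it is not the emulator code itself, just the value transformation.

#include <stdint.h>
#include <stdio.h>

static uint64_t demo_movbe(uint64_t dst, uint64_t src, int op_bytes)
{
	switch (op_bytes) {
	case 2:
		/* only the low word is written back byte-swapped */
		return (dst & ~0xffffull) | __builtin_bswap16((uint16_t)src);
	case 4:
		return __builtin_bswap32((uint32_t)src);
	case 8:
		return __builtin_bswap64(src);
	default:
		return dst; /* the emulator propagates a fault here instead */
	}
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)demo_movbe(0xaaaabbbbccccddddull,
							 0x1122334455667788ull, 2));
	/* prints 0xaaaabbbbcccc8877: upper 48 bits preserved */
	return 0;
}
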
@@ -3256,6 +3290,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
3256 return X86EMUL_CONTINUE; 3290 return X86EMUL_CONTINUE;
3257} 3291}
3258 3292
3293static int em_sahf(struct x86_emulate_ctxt *ctxt)
3294{
3295 u32 flags;
3296
3297 flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
3298 flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
3299
3300 ctxt->eflags &= ~0xffUL;
3301 ctxt->eflags |= flags | X86_EFLAGS_FIXED;
3302 return X86EMUL_CONTINUE;
3303}
3304
3259static int em_lahf(struct x86_emulate_ctxt *ctxt) 3305static int em_lahf(struct x86_emulate_ctxt *ctxt)
3260{ 3306{
3261 *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL; 3307 *reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
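
[Illustration] em_sahf() loads SF/ZF/AF/PF/CF from AH, forces the reserved always-one bit, and clears the other low-byte flag positions. A sketch of the mask arithmetic follows; the FL_* constants are the architectural EFLAGS bit values.

#include <stdint.h>
#include <stdio.h>

#define FL_CF    0x001u
#define FL_FIXED 0x002u /* always-one bit */
#define FL_PF    0x004u
#define FL_AF    0x010u
#define FL_ZF    0x040u
#define FL_SF    0x080u

static uint32_t demo_sahf(uint32_t eflags, uint32_t rax)
{
	uint32_t flags = (FL_CF | FL_PF | FL_AF | FL_ZF | FL_SF) & (rax >> 8);

	return (eflags & ~0xffu) | flags | FL_FIXED;
}

int main(void)
{
	/* AH = 0xd5 sets SF, ZF, AF, PF and CF */
	printf("%#x\n", demo_sahf(0x200, 0xd500)); /* prints 0x2d7 */
	return 0;
}
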
@@ -3502,7 +3548,7 @@ static const struct opcode group7_rm1[] = {
3502 3548
3503static const struct opcode group7_rm3[] = { 3549static const struct opcode group7_rm3[] = {
3504 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa), 3550 DIP(SrcNone | Prot | Priv, vmrun, check_svme_pa),
3505 II(SrcNone | Prot | VendorSpecific, em_vmmcall, vmmcall), 3551 II(SrcNone | Prot | EmulateOnUD, em_vmmcall, vmmcall),
3506 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa), 3552 DIP(SrcNone | Prot | Priv, vmload, check_svme_pa),
3507 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa), 3553 DIP(SrcNone | Prot | Priv, vmsave, check_svme_pa),
3508 DIP(SrcNone | Prot | Priv, stgi, check_svme), 3554 DIP(SrcNone | Prot | Priv, stgi, check_svme),
@@ -3587,7 +3633,7 @@ static const struct group_dual group7 = { {
3587 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw), 3633 II(SrcMem16 | Mov | Priv, em_lmsw, lmsw),
3588 II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg), 3634 II(SrcMem | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3589}, { 3635}, {
3590 I(SrcNone | Priv | VendorSpecific, em_vmcall), 3636 I(SrcNone | Priv | EmulateOnUD, em_vmcall),
3591 EXT(0, group7_rm1), 3637 EXT(0, group7_rm1),
3592 N, EXT(0, group7_rm3), 3638 N, EXT(0, group7_rm3),
3593 II(SrcNone | DstMem | Mov, em_smsw, smsw), N, 3639 II(SrcNone | DstMem | Mov, em_smsw, smsw), N,
@@ -3750,7 +3796,8 @@ static const struct opcode opcode_table[256] = {
3750 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3796 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3751 I(SrcImmFAddr | No64, em_call_far), N, 3797 I(SrcImmFAddr | No64, em_call_far), N,
3752 II(ImplicitOps | Stack, em_pushf, pushf), 3798 II(ImplicitOps | Stack, em_pushf, pushf),
3753 II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), 3799 II(ImplicitOps | Stack, em_popf, popf),
3800 I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
3754 /* 0xA0 - 0xA7 */ 3801 /* 0xA0 - 0xA7 */
3755 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3802 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3756 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), 3803 I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3810,7 +3857,7 @@ static const struct opcode opcode_table[256] = {
3810static const struct opcode twobyte_table[256] = { 3857static const struct opcode twobyte_table[256] = {
3811 /* 0x00 - 0x0F */ 3858 /* 0x00 - 0x0F */
3812 G(0, group6), GD(0, &group7), N, N, 3859 G(0, group6), GD(0, &group7), N, N,
3813 N, I(ImplicitOps | VendorSpecific, em_syscall), 3860 N, I(ImplicitOps | EmulateOnUD, em_syscall),
3814 II(ImplicitOps | Priv, em_clts, clts), N, 3861 II(ImplicitOps | Priv, em_clts, clts), N,
3815 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, 3862 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3816 N, D(ImplicitOps | ModRM), N, N, 3863 N, D(ImplicitOps | ModRM), N, N,
@@ -3830,8 +3877,8 @@ static const struct opcode twobyte_table[256] = {
3830 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), 3877 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3831 II(ImplicitOps | Priv, em_rdmsr, rdmsr), 3878 II(ImplicitOps | Priv, em_rdmsr, rdmsr),
3832 IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), 3879 IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
3833 I(ImplicitOps | VendorSpecific, em_sysenter), 3880 I(ImplicitOps | EmulateOnUD, em_sysenter),
3834 I(ImplicitOps | Priv | VendorSpecific, em_sysexit), 3881 I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
3835 N, N, 3882 N, N,
3836 N, N, N, N, N, N, N, N, 3883 N, N, N, N, N, N, N, N,
3837 /* 0x40 - 0x4F */ 3884 /* 0x40 - 0x4F */
@@ -3892,6 +3939,30 @@ static const struct opcode twobyte_table[256] = {
3892 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N 3939 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
3893}; 3940};
3894 3941
3942static const struct gprefix three_byte_0f_38_f0 = {
3943 I(DstReg | SrcMem | Mov, em_movbe), N, N, N
3944};
3945
3946static const struct gprefix three_byte_0f_38_f1 = {
3947 I(DstMem | SrcReg | Mov, em_movbe), N, N, N
3948};
3949
3950/*
3951 * Insns below are selected by the prefix which indexed by the third opcode
3952 * byte.
3953 */
3954static const struct opcode opcode_map_0f_38[256] = {
3955 /* 0x00 - 0x7f */
3956 X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
3957 /* 0x80 - 0xef */
3958 X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
3959 /* 0xf0 - 0xf1 */
3960 GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
3961 GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
3962 /* 0xf2 - 0xff */
3963 N, N, X4(N), X8(N)
3964};
3965
3895#undef D 3966#undef D
3896#undef N 3967#undef N
3897#undef G 3968#undef G
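
[Illustration] With opcode_map_0f_38 in place, decode follows a 0x0f escape, then a 0x38 byte, and indexes a third table by the next opcode byte (0xf0/0xf1 are the MOVBE forms). A toy sketch of that fetch-and-dispatch order is below; the table contents and demo_* names are placeholders, not the emulator's structures.

#include <stdint.h>
#include <stdio.h>

static const char *demo_map_0f_38(uint8_t b)
{
	switch (b) {
	case 0xf0: return "movbe reg, mem";
	case 0xf1: return "movbe mem, reg";
	default:   return "undefined";
	}
}

static void demo_decode(const uint8_t *insn, int *opcode_len, const char **name)
{
	*opcode_len = 1;
	*name = "one-byte opcode";
	if (insn[0] == 0x0f) {
		*opcode_len = 2;
		*name = "two-byte opcode";
		if (insn[1] == 0x38) {          /* 0F 38 escape */
			*opcode_len = 3;
			*name = demo_map_0f_38(insn[2]);
		}
	}
}

int main(void)
{
	const uint8_t movbe[] = { 0x0f, 0x38, 0xf0, 0x06 }; /* 0f 38 f0 /r: movbe r32, m32 */
	int len;
	const char *name;

	demo_decode(movbe, &len, &name);
	printf("opcode_len=%d -> %s\n", len, name);
	return 0;
}
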
@@ -4040,7 +4111,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
4040 case OpMem8: 4111 case OpMem8:
4041 ctxt->memop.bytes = 1; 4112 ctxt->memop.bytes = 1;
4042 if (ctxt->memop.type == OP_REG) { 4113 if (ctxt->memop.type == OP_REG) {
4043 ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1); 4114 ctxt->memop.addr.reg = decode_register(ctxt,
4115 ctxt->modrm_rm, true);
4044 fetch_register_operand(&ctxt->memop); 4116 fetch_register_operand(&ctxt->memop);
4045 } 4117 }
4046 goto mem_common; 4118 goto mem_common;
@@ -4126,6 +4198,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
4126 ctxt->_eip = ctxt->eip; 4198 ctxt->_eip = ctxt->eip;
4127 ctxt->fetch.start = ctxt->_eip; 4199 ctxt->fetch.start = ctxt->_eip;
4128 ctxt->fetch.end = ctxt->fetch.start + insn_len; 4200 ctxt->fetch.end = ctxt->fetch.start + insn_len;
4201 ctxt->opcode_len = 1;
4129 if (insn_len > 0) 4202 if (insn_len > 0)
4130 memcpy(ctxt->fetch.data, insn, insn_len); 4203 memcpy(ctxt->fetch.data, insn, insn_len);
4131 4204
@@ -4208,9 +4281,16 @@ done_prefixes:
4208 opcode = opcode_table[ctxt->b]; 4281 opcode = opcode_table[ctxt->b];
4209 /* Two-byte opcode? */ 4282 /* Two-byte opcode? */
4210 if (ctxt->b == 0x0f) { 4283 if (ctxt->b == 0x0f) {
4211 ctxt->twobyte = 1; 4284 ctxt->opcode_len = 2;
4212 ctxt->b = insn_fetch(u8, ctxt); 4285 ctxt->b = insn_fetch(u8, ctxt);
4213 opcode = twobyte_table[ctxt->b]; 4286 opcode = twobyte_table[ctxt->b];
4287
4288 /* 0F_38 opcode map */
4289 if (ctxt->b == 0x38) {
4290 ctxt->opcode_len = 3;
4291 ctxt->b = insn_fetch(u8, ctxt);
4292 opcode = opcode_map_0f_38[ctxt->b];
4293 }
4214 } 4294 }
4215 ctxt->d = opcode.flags; 4295 ctxt->d = opcode.flags;
4216 4296
@@ -4267,7 +4347,7 @@ done_prefixes:
4267 if (ctxt->d == 0 || (ctxt->d & NotImpl)) 4347 if (ctxt->d == 0 || (ctxt->d & NotImpl))
4268 return EMULATION_FAILED; 4348 return EMULATION_FAILED;
4269 4349
4270 if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) 4350 if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
4271 return EMULATION_FAILED; 4351 return EMULATION_FAILED;
4272 4352
4273 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) 4353 if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
@@ -4540,8 +4620,10 @@ special_insn:
4540 goto writeback; 4620 goto writeback;
4541 } 4621 }
4542 4622
4543 if (ctxt->twobyte) 4623 if (ctxt->opcode_len == 2)
4544 goto twobyte_insn; 4624 goto twobyte_insn;
4625 else if (ctxt->opcode_len == 3)
4626 goto threebyte_insn;
4545 4627
4546 switch (ctxt->b) { 4628 switch (ctxt->b) {
4547 case 0x63: /* movsxd */ 4629 case 0x63: /* movsxd */
@@ -4726,6 +4808,8 @@ twobyte_insn:
4726 goto cannot_emulate; 4808 goto cannot_emulate;
4727 } 4809 }
4728 4810
4811threebyte_insn:
4812
4729 if (rc != X86EMUL_CONTINUE) 4813 if (rc != X86EMUL_CONTINUE)
4730 goto done; 4814 goto done;
4731 4815
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dce0df8150df..40772ef0f2b1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2570,11 +2570,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2570 kvm_release_pfn_clean(pfn); 2570 kvm_release_pfn_clean(pfn);
2571} 2571}
2572 2572
2573static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2574{
2575 mmu_free_roots(vcpu);
2576}
2577
2578static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2573static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2579 bool no_dirty_log) 2574 bool no_dirty_log)
2580{ 2575{
@@ -3424,18 +3419,11 @@ out_unlock:
3424 return 0; 3419 return 0;
3425} 3420}
3426 3421
3427static void nonpaging_free(struct kvm_vcpu *vcpu) 3422static void nonpaging_init_context(struct kvm_vcpu *vcpu,
3428{ 3423 struct kvm_mmu *context)
3429 mmu_free_roots(vcpu);
3430}
3431
3432static int nonpaging_init_context(struct kvm_vcpu *vcpu,
3433 struct kvm_mmu *context)
3434{ 3424{
3435 context->new_cr3 = nonpaging_new_cr3;
3436 context->page_fault = nonpaging_page_fault; 3425 context->page_fault = nonpaging_page_fault;
3437 context->gva_to_gpa = nonpaging_gva_to_gpa; 3426 context->gva_to_gpa = nonpaging_gva_to_gpa;
3438 context->free = nonpaging_free;
3439 context->sync_page = nonpaging_sync_page; 3427 context->sync_page = nonpaging_sync_page;
3440 context->invlpg = nonpaging_invlpg; 3428 context->invlpg = nonpaging_invlpg;
3441 context->update_pte = nonpaging_update_pte; 3429 context->update_pte = nonpaging_update_pte;
@@ -3444,7 +3432,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
3444 context->root_hpa = INVALID_PAGE; 3432 context->root_hpa = INVALID_PAGE;
3445 context->direct_map = true; 3433 context->direct_map = true;
3446 context->nx = false; 3434 context->nx = false;
3447 return 0;
3448} 3435}
3449 3436
3450void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3437void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -3454,9 +3441,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3454} 3441}
3455EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb); 3442EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
3456 3443
3457static void paging_new_cr3(struct kvm_vcpu *vcpu) 3444void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
3458{ 3445{
3459 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
3460 mmu_free_roots(vcpu); 3446 mmu_free_roots(vcpu);
3461} 3447}
3462 3448
@@ -3471,11 +3457,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
3471 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 3457 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
3472} 3458}
3473 3459
3474static void paging_free(struct kvm_vcpu *vcpu)
3475{
3476 nonpaging_free(vcpu);
3477}
3478
3479static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, 3460static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
3480 unsigned access, int *nr_present) 3461 unsigned access, int *nr_present)
3481{ 3462{
@@ -3665,9 +3646,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
3665 mmu->last_pte_bitmap = map; 3646 mmu->last_pte_bitmap = map;
3666} 3647}
3667 3648
3668static int paging64_init_context_common(struct kvm_vcpu *vcpu, 3649static void paging64_init_context_common(struct kvm_vcpu *vcpu,
3669 struct kvm_mmu *context, 3650 struct kvm_mmu *context,
3670 int level) 3651 int level)
3671{ 3652{
3672 context->nx = is_nx(vcpu); 3653 context->nx = is_nx(vcpu);
3673 context->root_level = level; 3654 context->root_level = level;
@@ -3677,27 +3658,24 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
3677 update_last_pte_bitmap(vcpu, context); 3658 update_last_pte_bitmap(vcpu, context);
3678 3659
3679 ASSERT(is_pae(vcpu)); 3660 ASSERT(is_pae(vcpu));
3680 context->new_cr3 = paging_new_cr3;
3681 context->page_fault = paging64_page_fault; 3661 context->page_fault = paging64_page_fault;
3682 context->gva_to_gpa = paging64_gva_to_gpa; 3662 context->gva_to_gpa = paging64_gva_to_gpa;
3683 context->sync_page = paging64_sync_page; 3663 context->sync_page = paging64_sync_page;
3684 context->invlpg = paging64_invlpg; 3664 context->invlpg = paging64_invlpg;
3685 context->update_pte = paging64_update_pte; 3665 context->update_pte = paging64_update_pte;
3686 context->free = paging_free;
3687 context->shadow_root_level = level; 3666 context->shadow_root_level = level;
3688 context->root_hpa = INVALID_PAGE; 3667 context->root_hpa = INVALID_PAGE;
3689 context->direct_map = false; 3668 context->direct_map = false;
3690 return 0;
3691} 3669}
3692 3670
3693static int paging64_init_context(struct kvm_vcpu *vcpu, 3671static void paging64_init_context(struct kvm_vcpu *vcpu,
3694 struct kvm_mmu *context) 3672 struct kvm_mmu *context)
3695{ 3673{
3696 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); 3674 paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
3697} 3675}
3698 3676
3699static int paging32_init_context(struct kvm_vcpu *vcpu, 3677static void paging32_init_context(struct kvm_vcpu *vcpu,
3700 struct kvm_mmu *context) 3678 struct kvm_mmu *context)
3701{ 3679{
3702 context->nx = false; 3680 context->nx = false;
3703 context->root_level = PT32_ROOT_LEVEL; 3681 context->root_level = PT32_ROOT_LEVEL;
@@ -3706,33 +3684,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
3706 update_permission_bitmask(vcpu, context, false); 3684 update_permission_bitmask(vcpu, context, false);
3707 update_last_pte_bitmap(vcpu, context); 3685 update_last_pte_bitmap(vcpu, context);
3708 3686
3709 context->new_cr3 = paging_new_cr3;
3710 context->page_fault = paging32_page_fault; 3687 context->page_fault = paging32_page_fault;
3711 context->gva_to_gpa = paging32_gva_to_gpa; 3688 context->gva_to_gpa = paging32_gva_to_gpa;
3712 context->free = paging_free;
3713 context->sync_page = paging32_sync_page; 3689 context->sync_page = paging32_sync_page;
3714 context->invlpg = paging32_invlpg; 3690 context->invlpg = paging32_invlpg;
3715 context->update_pte = paging32_update_pte; 3691 context->update_pte = paging32_update_pte;
3716 context->shadow_root_level = PT32E_ROOT_LEVEL; 3692 context->shadow_root_level = PT32E_ROOT_LEVEL;
3717 context->root_hpa = INVALID_PAGE; 3693 context->root_hpa = INVALID_PAGE;
3718 context->direct_map = false; 3694 context->direct_map = false;
3719 return 0;
3720} 3695}
3721 3696
3722static int paging32E_init_context(struct kvm_vcpu *vcpu, 3697static void paging32E_init_context(struct kvm_vcpu *vcpu,
3723 struct kvm_mmu *context) 3698 struct kvm_mmu *context)
3724{ 3699{
3725 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); 3700 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
3726} 3701}
3727 3702
3728static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 3703static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3729{ 3704{
3730 struct kvm_mmu *context = vcpu->arch.walk_mmu; 3705 struct kvm_mmu *context = vcpu->arch.walk_mmu;
3731 3706
3732 context->base_role.word = 0; 3707 context->base_role.word = 0;
3733 context->new_cr3 = nonpaging_new_cr3;
3734 context->page_fault = tdp_page_fault; 3708 context->page_fault = tdp_page_fault;
3735 context->free = nonpaging_free;
3736 context->sync_page = nonpaging_sync_page; 3709 context->sync_page = nonpaging_sync_page;
3737 context->invlpg = nonpaging_invlpg; 3710 context->invlpg = nonpaging_invlpg;
3738 context->update_pte = nonpaging_update_pte; 3711 context->update_pte = nonpaging_update_pte;
@@ -3767,37 +3740,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3767 3740
3768 update_permission_bitmask(vcpu, context, false); 3741 update_permission_bitmask(vcpu, context, false);
3769 update_last_pte_bitmap(vcpu, context); 3742 update_last_pte_bitmap(vcpu, context);
3770
3771 return 0;
3772} 3743}
3773 3744
3774int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3745void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3775{ 3746{
3776 int r;
3777 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3747 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3778 ASSERT(vcpu); 3748 ASSERT(vcpu);
3779 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3749 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3780 3750
3781 if (!is_paging(vcpu)) 3751 if (!is_paging(vcpu))
3782 r = nonpaging_init_context(vcpu, context); 3752 nonpaging_init_context(vcpu, context);
3783 else if (is_long_mode(vcpu)) 3753 else if (is_long_mode(vcpu))
3784 r = paging64_init_context(vcpu, context); 3754 paging64_init_context(vcpu, context);
3785 else if (is_pae(vcpu)) 3755 else if (is_pae(vcpu))
3786 r = paging32E_init_context(vcpu, context); 3756 paging32E_init_context(vcpu, context);
3787 else 3757 else
3788 r = paging32_init_context(vcpu, context); 3758 paging32_init_context(vcpu, context);
3789 3759
3790 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); 3760 vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
3791 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3761 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3792 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3762 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3793 vcpu->arch.mmu.base_role.smep_andnot_wp 3763 vcpu->arch.mmu.base_role.smep_andnot_wp
3794 = smep && !is_write_protection(vcpu); 3764 = smep && !is_write_protection(vcpu);
3795
3796 return r;
3797} 3765}
3798EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 3766EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
3799 3767
3800int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 3768void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3801 bool execonly) 3769 bool execonly)
3802{ 3770{
3803 ASSERT(vcpu); 3771 ASSERT(vcpu);
@@ -3806,37 +3774,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
3806 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 3774 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
3807 3775
3808 context->nx = true; 3776 context->nx = true;
3809 context->new_cr3 = paging_new_cr3;
3810 context->page_fault = ept_page_fault; 3777 context->page_fault = ept_page_fault;
3811 context->gva_to_gpa = ept_gva_to_gpa; 3778 context->gva_to_gpa = ept_gva_to_gpa;
3812 context->sync_page = ept_sync_page; 3779 context->sync_page = ept_sync_page;
3813 context->invlpg = ept_invlpg; 3780 context->invlpg = ept_invlpg;
3814 context->update_pte = ept_update_pte; 3781 context->update_pte = ept_update_pte;
3815 context->free = paging_free;
3816 context->root_level = context->shadow_root_level; 3782 context->root_level = context->shadow_root_level;
3817 context->root_hpa = INVALID_PAGE; 3783 context->root_hpa = INVALID_PAGE;
3818 context->direct_map = false; 3784 context->direct_map = false;
3819 3785
3820 update_permission_bitmask(vcpu, context, true); 3786 update_permission_bitmask(vcpu, context, true);
3821 reset_rsvds_bits_mask_ept(vcpu, context, execonly); 3787 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
3822
3823 return 0;
3824} 3788}
3825EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 3789EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
3826 3790
3827static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3791static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
3828{ 3792{
3829 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); 3793 kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
3830
3831 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; 3794 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3832 vcpu->arch.walk_mmu->get_cr3 = get_cr3; 3795 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3833 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; 3796 vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
3834 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 3797 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3835
3836 return r;
3837} 3798}
3838 3799
3839static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 3800static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3840{ 3801{
3841 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 3802 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3842 3803
@@ -3873,11 +3834,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3873 3834
3874 update_permission_bitmask(vcpu, g_context, false); 3835 update_permission_bitmask(vcpu, g_context, false);
3875 update_last_pte_bitmap(vcpu, g_context); 3836 update_last_pte_bitmap(vcpu, g_context);
3876
3877 return 0;
3878} 3837}
3879 3838
3880static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3839static void init_kvm_mmu(struct kvm_vcpu *vcpu)
3881{ 3840{
3882 if (mmu_is_nested(vcpu)) 3841 if (mmu_is_nested(vcpu))
3883 return init_kvm_nested_mmu(vcpu); 3842 return init_kvm_nested_mmu(vcpu);
@@ -3887,18 +3846,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3887 return init_kvm_softmmu(vcpu); 3846 return init_kvm_softmmu(vcpu);
3888} 3847}
3889 3848
3890static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 3849void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
3891{ 3850{
3892 ASSERT(vcpu); 3851 ASSERT(vcpu);
3893 if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
3894 /* mmu.free() should set root_hpa = INVALID_PAGE */
3895 vcpu->arch.mmu.free(vcpu);
3896}
3897 3852
3898int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 3853 kvm_mmu_unload(vcpu);
3899{ 3854 init_kvm_mmu(vcpu);
3900 destroy_kvm_mmu(vcpu);
3901 return init_kvm_mmu(vcpu);
3902} 3855}
3903EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 3856EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
3904 3857
@@ -3923,6 +3876,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
3923void kvm_mmu_unload(struct kvm_vcpu *vcpu) 3876void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3924{ 3877{
3925 mmu_free_roots(vcpu); 3878 mmu_free_roots(vcpu);
3879 WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
3926} 3880}
3927EXPORT_SYMBOL_GPL(kvm_mmu_unload); 3881EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3928 3882
@@ -4281,12 +4235,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
4281 return alloc_mmu_pages(vcpu); 4235 return alloc_mmu_pages(vcpu);
4282} 4236}
4283 4237
4284int kvm_mmu_setup(struct kvm_vcpu *vcpu) 4238void kvm_mmu_setup(struct kvm_vcpu *vcpu)
4285{ 4239{
4286 ASSERT(vcpu); 4240 ASSERT(vcpu);
4287 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 4241 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
4288 4242
4289 return init_kvm_mmu(vcpu); 4243 init_kvm_mmu(vcpu);
4290} 4244}
4291 4245
4292void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 4246void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
@@ -4428,7 +4382,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
4428 int nr_to_scan = sc->nr_to_scan; 4382 int nr_to_scan = sc->nr_to_scan;
4429 unsigned long freed = 0; 4383 unsigned long freed = 0;
4430 4384
4431 raw_spin_lock(&kvm_lock); 4385 spin_lock(&kvm_lock);
4432 4386
4433 list_for_each_entry(kvm, &vm_list, vm_list) { 4387 list_for_each_entry(kvm, &vm_list, vm_list) {
4434 int idx; 4388 int idx;
@@ -4478,9 +4432,8 @@ unlock:
4478 break; 4432 break;
4479 } 4433 }
4480 4434
4481 raw_spin_unlock(&kvm_lock); 4435 spin_unlock(&kvm_lock);
4482 return freed; 4436 return freed;
4483
4484} 4437}
4485 4438
4486static unsigned long 4439static unsigned long
@@ -4574,7 +4527,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
4574{ 4527{
4575 ASSERT(vcpu); 4528 ASSERT(vcpu);
4576 4529
4577 destroy_kvm_mmu(vcpu); 4530 kvm_mmu_unload(vcpu);
4578 free_mmu_pages(vcpu); 4531 free_mmu_pages(vcpu);
4579 mmu_free_memory_caches(vcpu); 4532 mmu_free_memory_caches(vcpu);
4580} 4533}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 77e044a0f5f7..292615274358 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -70,8 +70,8 @@ enum {
70}; 70};
71 71
72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); 72int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
73int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); 73void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
74int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, 74void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
75 bool execonly); 75 bool execonly);
76 76
77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) 77static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c0bc80391e40..c7168a5cff1b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1959,11 +1959,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1959 nested_svm_vmexit(svm); 1959 nested_svm_vmexit(svm);
1960} 1960}
1961 1961
1962static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) 1962static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1963{ 1963{
1964 int r; 1964 kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1965
1966 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1967 1965
1968 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; 1966 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1969 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; 1967 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
@@ -1971,8 +1969,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1971 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; 1969 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1972 vcpu->arch.mmu.shadow_root_level = get_npt_level(); 1970 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1973 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 1971 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1974
1975 return r;
1976} 1972}
1977 1973
1978static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) 1974static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b2fce1b2009..b2fe1c252f35 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1498,7 +1498,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1498 break; 1498 break;
1499 1499
1500 if (i == NR_AUTOLOAD_MSRS) { 1500 if (i == NR_AUTOLOAD_MSRS) {
1501 printk_once(KERN_WARNING"Not enough mst switch entries. " 1501 printk_once(KERN_WARNING "Not enough msr switch entries. "
1502 "Can't add msr %x\n", msr); 1502 "Can't add msr %x\n", msr);
1503 return; 1503 return;
1504 } else if (i == m->nr) { 1504 } else if (i == m->nr) {
@@ -1898,16 +1898,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1898/* 1898/*
1899 * KVM wants to inject page-faults which it got to the guest. This function 1899 * KVM wants to inject page-faults which it got to the guest. This function
1900 * checks whether in a nested guest, we need to inject them to L1 or L2. 1900 * checks whether in a nested guest, we need to inject them to L1 or L2.
1901 * This function assumes it is called with the exit reason in vmcs02 being
1902 * a #PF exception (this is the only case in which KVM injects a #PF when L2
1903 * is running).
1904 */ 1901 */
1905static int nested_pf_handled(struct kvm_vcpu *vcpu) 1902static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
1906{ 1903{
1907 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 1904 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1908 1905
1909 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ 1906 if (!(vmcs12->exception_bitmap & (1u << nr)))
1910 if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR)))
1911 return 0; 1907 return 0;
1912 1908
1913 nested_vmx_vmexit(vcpu); 1909 nested_vmx_vmexit(vcpu);
@@ -1921,8 +1917,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1921 struct vcpu_vmx *vmx = to_vmx(vcpu); 1917 struct vcpu_vmx *vmx = to_vmx(vcpu);
1922 u32 intr_info = nr | INTR_INFO_VALID_MASK; 1918 u32 intr_info = nr | INTR_INFO_VALID_MASK;
1923 1919
1924 if (nr == PF_VECTOR && is_guest_mode(vcpu) && 1920 if (!reinject && is_guest_mode(vcpu) &&
1925 !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) 1921 nested_vmx_check_exception(vcpu, nr))
1926 return; 1922 return;
1927 1923
1928 if (has_error_code) { 1924 if (has_error_code) {
@@ -2204,9 +2200,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2204#ifdef CONFIG_X86_64 2200#ifdef CONFIG_X86_64
2205 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2201 VM_EXIT_HOST_ADDR_SPACE_SIZE |
2206#endif 2202#endif
2207 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2203 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
2204 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
2205 if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) ||
2206 !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) {
2207 nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
2208 nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2209 }
2208 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2210 nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2209 VM_EXIT_LOAD_IA32_EFER); 2211 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER);
2210 2212
2211 /* entry controls */ 2213 /* entry controls */
2212 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2214 rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2226,7 +2228,8 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2226 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); 2228 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
2227 nested_vmx_procbased_ctls_low = 0; 2229 nested_vmx_procbased_ctls_low = 0;
2228 nested_vmx_procbased_ctls_high &= 2230 nested_vmx_procbased_ctls_high &=
2229 CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2231 CPU_BASED_VIRTUAL_INTR_PENDING |
2232 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2230 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2233 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2231 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 2234 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2232 CPU_BASED_CR3_STORE_EXITING | 2235 CPU_BASED_CR3_STORE_EXITING |
@@ -2252,13 +2255,15 @@ static __init void nested_vmx_setup_ctls_msrs(void)
2252 nested_vmx_secondary_ctls_low = 0; 2255 nested_vmx_secondary_ctls_low = 0;
2253 nested_vmx_secondary_ctls_high &= 2256 nested_vmx_secondary_ctls_high &=
2254 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2257 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2258 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2255 SECONDARY_EXEC_WBINVD_EXITING; 2259 SECONDARY_EXEC_WBINVD_EXITING;
2256 2260
2257 if (enable_ept) { 2261 if (enable_ept) {
2258 /* nested EPT: emulate EPT also to L1 */ 2262 /* nested EPT: emulate EPT also to L1 */
2259 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; 2263 nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
2260 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2264 nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2261 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; 2265 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2266 VMX_EPT_INVEPT_BIT;
2262 nested_vmx_ept_caps &= vmx_capability.ept; 2267 nested_vmx_ept_caps &= vmx_capability.ept;
2263 /* 2268 /*
2264 * Since invept is completely emulated we support both global 2269 * Since invept is completely emulated we support both global
@@ -3380,8 +3385,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3380 if (enable_ept) { 3385 if (enable_ept) {
3381 eptp = construct_eptp(cr3); 3386 eptp = construct_eptp(cr3);
3382 vmcs_write64(EPT_POINTER, eptp); 3387 vmcs_write64(EPT_POINTER, eptp);
3383 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : 3388 if (is_paging(vcpu) || is_guest_mode(vcpu))
3384 vcpu->kvm->arch.ept_identity_map_addr; 3389 guest_cr3 = kvm_read_cr3(vcpu);
3390 else
3391 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
3385 ept_load_pdptrs(vcpu); 3392 ept_load_pdptrs(vcpu);
3386 } 3393 }
3387 3394
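
With EPT enabled the hardware walks the guest's own page tables, so vmx_set_cr3() programs GUEST_CR3 with either the CR3 the guest last loaded (when it is paging, or when a nested L2 is running and L1 manages that CR3) or the address of KVM's identity-mapped page table used while the guest runs unpaged. A compact sketch of just that selection; the struct and field names below are stand-ins for the vcpu state, not the kernel's types:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical, flattened view of the vcpu state this decision reads. */
    struct vcpu_state {
        bool paging;                     /* CR0.PG as seen by the guest      */
        bool guest_mode;                 /* true while a nested L2 is active */
        uint64_t guest_cr3;              /* CR3 the guest last loaded        */
        uint64_t ept_identity_map_addr;  /* KVM's identity-mapped pagetable  */
    };

    static uint64_t pick_guest_cr3(const struct vcpu_state *v)
    {
        if (v->paging || v->guest_mode)
            return v->guest_cr3;
        return v->ept_identity_map_addr;
    }

    int main(void)
    {
        struct vcpu_state v = {
            .paging = false, .guest_mode = true,
            .guest_cr3 = 0x1000, .ept_identity_map_addr = 0xfeffc000,
        };
        printf("GUEST_CR3 = %#llx\n", (unsigned long long)pick_guest_cr3(&v));
        return 0;
    }
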
@@ -4879,6 +4886,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
4879 hypercall[2] = 0xc1; 4886 hypercall[2] = 0xc1;
4880} 4887}
4881 4888
4889static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val)
4890{
4891 unsigned long always_on = VMXON_CR0_ALWAYSON;
4892
4893 if (nested_vmx_secondary_ctls_high &
4894 SECONDARY_EXEC_UNRESTRICTED_GUEST &&
4895 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
4896 always_on &= ~(X86_CR0_PE | X86_CR0_PG);
4897 return (val & always_on) == always_on;
4898}
4899
4882/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 4900/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
4883static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 4901static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4884{ 4902{
@@ -4897,9 +4915,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
4897 val = (val & ~vmcs12->cr0_guest_host_mask) | 4915 val = (val & ~vmcs12->cr0_guest_host_mask) |
4898 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 4916 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
4899 4917
4900 /* TODO: will have to take unrestricted guest mode into 4918 if (!nested_cr0_valid(vmcs12, val))
4901 * account */
4902 if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)
4903 return 1; 4919 return 1;
4904 4920
4905 if (kvm_set_cr0(vcpu, val)) 4921 if (kvm_set_cr0(vcpu, val))
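
handle_set_cr0() above (and nested_vmx_run() further down) now checks a CR0 value destined for L2 with nested_cr0_valid() instead of the raw VMXON_CR0_ALWAYSON mask: when both KVM and vmcs12 enable "unrestricted guest", PE and PG drop out of the always-on set, so L1 may legally run an unpaged or real-mode L2. A self-contained sketch of that relaxation; the composition of the always-on mask (PE, NE, PG) is an assumption stated here, not shown in this hunk:

    #include <stdbool.h>
    #include <stdio.h>

    #define X86_CR0_PE (1ul << 0)
    #define X86_CR0_NE (1ul << 5)
    #define X86_CR0_PG (1ul << 31)

    /* Assumed shape of VMXON_CR0_ALWAYSON: PE, NE and PG normally required. */
    #define CR0_ALWAYSON (X86_CR0_PE | X86_CR0_NE | X86_CR0_PG)

    static bool cr0_valid_for_l2(unsigned long val, bool unrestricted_guest)
    {
        unsigned long always_on = CR0_ALWAYSON;

        if (unrestricted_guest)
            always_on &= ~(X86_CR0_PE | X86_CR0_PG); /* real mode allowed */
        return (val & always_on) == always_on;
    }

    int main(void)
    {
        unsigned long real_mode_cr0 = X86_CR0_NE;       /* PE = PG = 0 */

        printf("without unrestricted guest: %d\n",
               cr0_valid_for_l2(real_mode_cr0, false)); /* 0: rejected */
        printf("with unrestricted guest:    %d\n",
               cr0_valid_for_l2(real_mode_cr0, true));  /* 1: accepted */
        return 0;
    }
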
@@ -6627,6 +6643,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
6627 return 0; 6643 return 0;
6628 else if (is_page_fault(intr_info)) 6644 else if (is_page_fault(intr_info))
6629 return enable_ept; 6645 return enable_ept;
6646 else if (is_no_device(intr_info) &&
6647 !(nested_read_cr0(vmcs12) & X86_CR0_TS))
6648 return 0;
6630 return vmcs12->exception_bitmap & 6649 return vmcs12->exception_bitmap &
6631 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6650 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
6632 case EXIT_REASON_EXTERNAL_INTERRUPT: 6651 case EXIT_REASON_EXTERNAL_INTERRUPT:
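
The new #NM clause in nested_vmx_exit_handled() keeps a device-not-available fault in L0 whenever L1's shadowed CR0.TS is clear: in that case the fault can only stem from L0's own lazy FPU deactivation, so reflecting it to L1 (even when L1 traps #NM) would be wrong. A boiled-down model of that decision, with a hypothetical struct standing in for the vmcs12 fields and interrupt-info word the real code reads, and ignoring the NMI and page-fault branches shown above:

    #include <stdbool.h>
    #include <stdio.h>

    #define X86_CR0_TS (1ul << 3)
    #define NM_VECTOR  7

    /* Hypothetical summary of what the real code reads from the VMCS. */
    struct l2_exception {
        int vector;                       /* exception vector of the L2 fault */
        unsigned long l1_cr0;             /* CR0 as L1 believes it set it     */
        unsigned int l1_exception_bitmap;
    };

    /* true  -> reflect the exception to L1 as a nested vmexit
     * false -> let L0 handle it and resume L2 directly */
    static bool reflect_exception_to_l1(const struct l2_exception *e)
    {
        if (e->vector == NM_VECTOR && !(e->l1_cr0 & X86_CR0_TS))
            return false;   /* #NM caused by L0's lazy FPU, not by L1 */
        return e->l1_exception_bitmap & (1u << e->vector);
    }

    int main(void)
    {
        struct l2_exception nm = { .vector = NM_VECTOR, .l1_cr0 = 0,
                                   .l1_exception_bitmap = 1u << NM_VECTOR };
        printf("reflect #NM, L1 CR0.TS clear: %d\n",
               reflect_exception_to_l1(&nm));   /* 0 */
        nm.l1_cr0 = X86_CR0_TS;
        printf("reflect #NM, L1 CR0.TS set:   %d\n",
               reflect_exception_to_l1(&nm));   /* 1 */
        return 0;
    }
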
@@ -6722,6 +6741,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
6722 *info2 = vmcs_read32(VM_EXIT_INTR_INFO); 6741 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
6723} 6742}
6724 6743
6744static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu)
6745{
6746 u64 delta_tsc_l1;
6747 u32 preempt_val_l1, preempt_val_l2, preempt_scale;
6748
6749 if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control &
6750 PIN_BASED_VMX_PREEMPTION_TIMER))
6751 return;
6752 preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) &
6753 MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;
6754 preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
6755 delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc())
6756 - vcpu->arch.last_guest_tsc;
6757 preempt_val_l1 = delta_tsc_l1 >> preempt_scale;
6758 if (preempt_val_l2 <= preempt_val_l1)
6759 preempt_val_l2 = 0;
6760 else
6761 preempt_val_l2 -= preempt_val_l1;
6762 vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2);
6763}
6764
6725/* 6765/*
6726 * The guest has exited. See if we can fix it or if we need userspace 6766 * The guest has exited. See if we can fix it or if we need userspace
6727 * assistance. 6767 * assistance.
@@ -6736,20 +6776,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
6736 if (vmx->emulation_required) 6776 if (vmx->emulation_required)
6737 return handle_invalid_guest_state(vcpu); 6777 return handle_invalid_guest_state(vcpu);
6738 6778
6739 /*
6740 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
6741 * we did not inject a still-pending event to L1 now because of
6742 * nested_run_pending, we need to re-enable this bit.
6743 */
6744 if (vmx->nested.nested_run_pending)
6745 kvm_make_request(KVM_REQ_EVENT, vcpu);
6746
6747 if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
6748 exit_reason == EXIT_REASON_VMRESUME))
6749 vmx->nested.nested_run_pending = 1;
6750 else
6751 vmx->nested.nested_run_pending = 0;
6752
6753 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { 6779 if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
6754 nested_vmx_vmexit(vcpu); 6780 nested_vmx_vmexit(vcpu);
6755 return 1; 6781 return 1;
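
nested_adjust_preemption_timer() above charges L2's VMX preemption timer for the time spent outside L2: the TSC cycles that elapsed in L1's timebase since the last entry are shifted right by the scale read from MSR_IA32_VMX_MISC (the mask in the hunk suggests bits 4:0), and that many timer ticks are subtracted from the value saved at the previous exit, clamping at zero so an overdue timer fires immediately on the next entry. A small arithmetic model of the adjustment, using plain integers rather than VMCS accessors:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * preempt_scale models MSR_IA32_VMX_MISC bits 4:0: the preemption timer
     * ticks once every 2^preempt_scale TSC cycles.
     */
    static uint32_t adjust_preemption_timer(uint32_t saved_timer_l2,
                                            uint64_t tsc_cycles_elapsed_l1,
                                            unsigned int preempt_scale)
    {
        uint32_t ticks_consumed = tsc_cycles_elapsed_l1 >> preempt_scale;

        if (saved_timer_l2 <= ticks_consumed)
            return 0;               /* already expired: fire right away */
        return saved_timer_l2 - ticks_consumed;
    }

    int main(void)
    {
        /* 1,000,000 TSC cycles spent outside L2 with scale 5 -> 31250 ticks */
        printf("remaining timer: %u\n",
               adjust_preemption_timer(100000, 1000000, 5));  /* 68750 */
        printf("remaining timer: %u\n",
               adjust_preemption_timer(20000, 1000000, 5));   /* 0 */
        return 0;
    }
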
@@ -7061,9 +7087,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7061 case INTR_TYPE_HARD_EXCEPTION: 7087 case INTR_TYPE_HARD_EXCEPTION:
7062 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 7088 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7063 u32 err = vmcs_read32(error_code_field); 7089 u32 err = vmcs_read32(error_code_field);
7064 kvm_queue_exception_e(vcpu, vector, err); 7090 kvm_requeue_exception_e(vcpu, vector, err);
7065 } else 7091 } else
7066 kvm_queue_exception(vcpu, vector); 7092 kvm_requeue_exception(vcpu, vector);
7067 break; 7093 break;
7068 case INTR_TYPE_SOFT_INTR: 7094 case INTR_TYPE_SOFT_INTR:
7069 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 7095 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
@@ -7146,6 +7172,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
7146 atomic_switch_perf_msrs(vmx); 7172 atomic_switch_perf_msrs(vmx);
7147 debugctlmsr = get_debugctlmsr(); 7173 debugctlmsr = get_debugctlmsr();
7148 7174
7175 if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending)
7176 nested_adjust_preemption_timer(vcpu);
7149 vmx->__launched = vmx->loaded_vmcs->launched; 7177 vmx->__launched = vmx->loaded_vmcs->launched;
7150 asm( 7178 asm(
7151 /* Store host registers */ 7179 /* Store host registers */
@@ -7284,6 +7312,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
7284 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 7312 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
7285 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); 7313 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
7286 7314
7315 /*
7316 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
7317 * we did not inject a still-pending event to L1 now because of
7318 * nested_run_pending, we need to re-enable this bit.
7319 */
7320 if (vmx->nested.nested_run_pending)
7321 kvm_make_request(KVM_REQ_EVENT, vcpu);
7322
7323 vmx->nested.nested_run_pending = 0;
7324
7287 vmx_complete_atomic_exit(vmx); 7325 vmx_complete_atomic_exit(vmx);
7288 vmx_recover_nmi_blocking(vmx); 7326 vmx_recover_nmi_blocking(vmx);
7289 vmx_complete_interrupts(vmx); 7327 vmx_complete_interrupts(vmx);
@@ -7410,8 +7448,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7410 */ 7448 */
7411 if (is_mmio) 7449 if (is_mmio)
7412 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 7450 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7413 else if (vcpu->kvm->arch.iommu_domain && 7451 else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
7414 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
7415 ret = kvm_get_guest_memory_type(vcpu, gfn) << 7452 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
7416 VMX_EPT_MT_EPTE_SHIFT; 7453 VMX_EPT_MT_EPTE_SHIFT;
7417 else 7454 else
@@ -7501,9 +7538,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
7501 return get_vmcs12(vcpu)->ept_pointer; 7538 return get_vmcs12(vcpu)->ept_pointer;
7502} 7539}
7503 7540
7504static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 7541static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7505{ 7542{
7506 int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, 7543 kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
7507 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); 7544 nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
7508 7545
7509 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 7546 vcpu->arch.mmu.set_cr3 = vmx_set_cr3;
@@ -7511,8 +7548,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
7511 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 7548 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
7512 7549
7513 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 7550 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
7514
7515 return r;
7516} 7551}
7517 7552
7518static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) 7553static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -7520,6 +7555,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
7520 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 7555 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
7521} 7556}
7522 7557
7558static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
7559 struct x86_exception *fault)
7560{
7561 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7562
7563 WARN_ON(!is_guest_mode(vcpu));
7564
7565 /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
7566 if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
7567 nested_vmx_vmexit(vcpu);
7568 else
7569 kvm_inject_page_fault(vcpu, fault);
7570}
7571
7523/* 7572/*
7524 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 7573 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
7525 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 7574 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
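
vmx_inject_page_fault_nested() above is installed as the walk_mmu page-fault callback while L2 runs without EPT (see the prepare_vmcs02 hunk below, and the matching restore to kvm_inject_page_fault in load_vmcs12_host_state): a fault raised by the emulated L2 MMU either becomes a nested vmexit to L1, if L1's exception bitmap traps #PF, or is injected straight into L2. A condensed model of that routing, using a plain bitmap argument in place of vmcs12; as the code's TODO notes, PFEC_MATCH/MASK filtering is deliberately ignored here as well:

    #include <stdio.h>

    #define PF_VECTOR 14

    enum pf_target { INJECT_INTO_L2, VMEXIT_TO_L1 };

    /* Decide where an L2 page fault goes, given L1's exception bitmap. */
    static enum pf_target route_l2_page_fault(unsigned int l1_exception_bitmap)
    {
        if (l1_exception_bitmap & (1u << PF_VECTOR))
            return VMEXIT_TO_L1;
        return INJECT_INTO_L2;
    }

    int main(void)
    {
        printf("L1 traps #PF:   %s\n",
               route_l2_page_fault(1u << PF_VECTOR) == VMEXIT_TO_L1 ?
               "vmexit to L1" : "inject into L2");
        printf("L1 ignores #PF: %s\n",
               route_l2_page_fault(0) == VMEXIT_TO_L1 ?
               "vmexit to L1" : "inject into L2");
        return 0;
    }
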
@@ -7533,6 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7533{ 7582{
7534 struct vcpu_vmx *vmx = to_vmx(vcpu); 7583 struct vcpu_vmx *vmx = to_vmx(vcpu);
7535 u32 exec_control; 7584 u32 exec_control;
7585 u32 exit_control;
7536 7586
7537 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 7587 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
7538 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 7588 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -7706,7 +7756,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7706 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 7756 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
7707 * bits are further modified by vmx_set_efer() below. 7757 * bits are further modified by vmx_set_efer() below.
7708 */ 7758 */
7709 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 7759 exit_control = vmcs_config.vmexit_ctrl;
7760 if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER)
7761 exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
7762 vmcs_write32(VM_EXIT_CONTROLS, exit_control);
7710 7763
7711 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 7764 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
7712 * emulated by vmx_set_efer(), below. 7765 * emulated by vmx_set_efer(), below.
@@ -7773,6 +7826,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
7773 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 7826 kvm_set_cr3(vcpu, vmcs12->guest_cr3);
7774 kvm_mmu_reset_context(vcpu); 7827 kvm_mmu_reset_context(vcpu);
7775 7828
7829 if (!enable_ept)
7830 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
7831
7776 /* 7832 /*
7777 * L1 may access the L2's PDPTR, so save them to construct vmcs12 7833 * L1 may access the L2's PDPTR, so save them to construct vmcs12
7778 */ 7834 */
@@ -7876,7 +7932,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7876 return 1; 7932 return 1;
7877 } 7933 }
7878 7934
7879 if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || 7935 if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) ||
7880 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 7936 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
7881 nested_vmx_entry_failure(vcpu, vmcs12, 7937 nested_vmx_entry_failure(vcpu, vmcs12,
7882 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 7938 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
@@ -7938,6 +7994,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
7938 7994
7939 enter_guest_mode(vcpu); 7995 enter_guest_mode(vcpu);
7940 7996
7997 vmx->nested.nested_run_pending = 1;
7998
7941 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); 7999 vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
7942 8000
7943 cpu = get_cpu(); 8001 cpu = get_cpu();
@@ -8005,7 +8063,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
8005 u32 idt_vectoring; 8063 u32 idt_vectoring;
8006 unsigned int nr; 8064 unsigned int nr;
8007 8065
8008 if (vcpu->arch.exception.pending) { 8066 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
8009 nr = vcpu->arch.exception.nr; 8067 nr = vcpu->arch.exception.nr;
8010 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 8068 idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
8011 8069
@@ -8023,7 +8081,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
8023 } 8081 }
8024 8082
8025 vmcs12->idt_vectoring_info_field = idt_vectoring; 8083 vmcs12->idt_vectoring_info_field = idt_vectoring;
8026 } else if (vcpu->arch.nmi_pending) { 8084 } else if (vcpu->arch.nmi_injected) {
8027 vmcs12->idt_vectoring_info_field = 8085 vmcs12->idt_vectoring_info_field =
8028 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 8086 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
8029 } else if (vcpu->arch.interrupt.pending) { 8087 } else if (vcpu->arch.interrupt.pending) {
@@ -8105,6 +8163,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8105 vmcs12->guest_pending_dbg_exceptions = 8163 vmcs12->guest_pending_dbg_exceptions =
8106 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 8164 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
8107 8165
8166 if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) &&
8167 (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
8168 vmcs12->vmx_preemption_timer_value =
8169 vmcs_read32(VMX_PREEMPTION_TIMER_VALUE);
8170
8108 /* 8171 /*
8109 * In some cases (usually, nested EPT), L2 is allowed to change its 8172 * In some cases (usually, nested EPT), L2 is allowed to change its
8110 * own CR3 without exiting. If it has changed it, we must keep it. 8173 * own CR3 without exiting. If it has changed it, we must keep it.
@@ -8130,6 +8193,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
8130 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 8193 vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
8131 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) 8194 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
8132 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); 8195 vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
8196 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
8197 vmcs12->guest_ia32_efer = vcpu->arch.efer;
8133 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); 8198 vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
8134 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); 8199 vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
8135 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); 8200 vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
@@ -8201,7 +8266,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8201 * fpu_active (which may have changed). 8266 * fpu_active (which may have changed).
8202 * Note that vmx_set_cr0 refers to efer set above. 8267 * Note that vmx_set_cr0 refers to efer set above.
8203 */ 8268 */
8204 kvm_set_cr0(vcpu, vmcs12->host_cr0); 8269 vmx_set_cr0(vcpu, vmcs12->host_cr0);
8205 /* 8270 /*
8206 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need 8271 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
8207 * to apply the same changes to L1's vmcs. We just set cr0 correctly, 8272 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
@@ -8224,6 +8289,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
8224 kvm_set_cr3(vcpu, vmcs12->host_cr3); 8289 kvm_set_cr3(vcpu, vmcs12->host_cr3);
8225 kvm_mmu_reset_context(vcpu); 8290 kvm_mmu_reset_context(vcpu);
8226 8291
8292 if (!enable_ept)
8293 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
8294
8227 if (enable_vpid) { 8295 if (enable_vpid) {
8228 /* 8296 /*
8229 * Trivially support vpid by letting L2s share their parent 8297 * Trivially support vpid by letting L2s share their parent
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5ca72a5cdb6..21ef1ba184ae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -577,6 +577,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
577int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 577int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
578{ 578{
579 u64 xcr0; 579 u64 xcr0;
580 u64 valid_bits;
580 581
581 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 582 /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
582 if (index != XCR_XFEATURE_ENABLED_MASK) 583 if (index != XCR_XFEATURE_ENABLED_MASK)
@@ -586,8 +587,16 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
586 return 1; 587 return 1;
587 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 588 if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
588 return 1; 589 return 1;
589 if (xcr0 & ~host_xcr0) 590
591 /*
592 * Do not allow the guest to set bits that we do not support
593 * saving. However, xcr0 bit 0 is always set, even if the
594 * emulated CPU does not support XSAVE (see fx_init).
595 */
596 valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
597 if (xcr0 & ~valid_bits)
590 return 1; 598 return 1;
599
591 kvm_put_guest_xcr0(vcpu); 600 kvm_put_guest_xcr0(vcpu);
592 vcpu->arch.xcr0 = xcr0; 601 vcpu->arch.xcr0 = xcr0;
593 return 0; 602 return 0;
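
The XCR0 check in __kvm_set_xcr() above now validates against the features exposed to this particular guest (vcpu->arch.guest_supported_xcr0) rather than against everything the host can save; XSTATE_FP is always tolerated because, as the new comment says, xcr0 bit 0 stays set even when the emulated CPU lacks XSAVE. A user-space sketch of the same subset test; the XSTATE_* bit values are architectural assumptions, not taken from this hunk:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define XSTATE_FP   (1ull << 0)
    #define XSTATE_SSE  (1ull << 1)
    #define XSTATE_YMM  (1ull << 2)

    static bool xcr0_acceptable(uint64_t new_xcr0, uint64_t guest_supported_xcr0)
    {
        /* Architectural constraints first: bit 0 set, YMM requires SSE. */
        if (!(new_xcr0 & XSTATE_FP))
            return false;
        if ((new_xcr0 & XSTATE_YMM) && !(new_xcr0 & XSTATE_SSE))
            return false;

        /* Reject anything the guest's CPUID did not advertise;
         * XSTATE_FP is always allowed. */
        uint64_t valid_bits = guest_supported_xcr0 | XSTATE_FP;
        return (new_xcr0 & ~valid_bits) == 0;
    }

    int main(void)
    {
        uint64_t guest_caps = XSTATE_FP | XSTATE_SSE;            /* no AVX */

        printf("FP|SSE     -> %d\n",
               xcr0_acceptable(XSTATE_FP | XSTATE_SSE, guest_caps));      /* 1 */
        printf("FP|SSE|YMM -> %d\n",
               xcr0_acceptable(XSTATE_FP | XSTATE_SSE | XSTATE_YMM,
                               guest_caps));                              /* 0 */
        return 0;
    }
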
@@ -684,7 +693,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
684 693
685 vcpu->arch.cr3 = cr3; 694 vcpu->arch.cr3 = cr3;
686 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 695 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
687 vcpu->arch.mmu.new_cr3(vcpu); 696 kvm_mmu_new_cr3(vcpu);
688 return 0; 697 return 0;
689} 698}
690EXPORT_SYMBOL_GPL(kvm_set_cr3); 699EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -2564,6 +2573,7 @@ int kvm_dev_ioctl_check_extension(long ext)
2564 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: 2573 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2565 case KVM_CAP_SET_TSS_ADDR: 2574 case KVM_CAP_SET_TSS_ADDR:
2566 case KVM_CAP_EXT_CPUID: 2575 case KVM_CAP_EXT_CPUID:
2576 case KVM_CAP_EXT_EMUL_CPUID:
2567 case KVM_CAP_CLOCKSOURCE: 2577 case KVM_CAP_CLOCKSOURCE:
2568 case KVM_CAP_PIT: 2578 case KVM_CAP_PIT:
2569 case KVM_CAP_NOP_IO_DELAY: 2579 case KVM_CAP_NOP_IO_DELAY:
@@ -2673,15 +2683,17 @@ long kvm_arch_dev_ioctl(struct file *filp,
2673 r = 0; 2683 r = 0;
2674 break; 2684 break;
2675 } 2685 }
2676 case KVM_GET_SUPPORTED_CPUID: { 2686 case KVM_GET_SUPPORTED_CPUID:
2687 case KVM_GET_EMULATED_CPUID: {
2677 struct kvm_cpuid2 __user *cpuid_arg = argp; 2688 struct kvm_cpuid2 __user *cpuid_arg = argp;
2678 struct kvm_cpuid2 cpuid; 2689 struct kvm_cpuid2 cpuid;
2679 2690
2680 r = -EFAULT; 2691 r = -EFAULT;
2681 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 2692 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2682 goto out; 2693 goto out;
2683 r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, 2694
2684 cpuid_arg->entries); 2695 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2696 ioctl);
2685 if (r) 2697 if (r)
2686 goto out; 2698 goto out;
2687 2699
@@ -2715,8 +2727,7 @@ static void wbinvd_ipi(void *garbage)
2715 2727
2716static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 2728static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2717{ 2729{
2718 return vcpu->kvm->arch.iommu_domain && 2730 return kvm_arch_has_noncoherent_dma(vcpu->kvm);
2719 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
2720} 2731}
2721 2732
2722void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 2733void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2984,11 +2995,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
2984static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2995static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
2985 struct kvm_xsave *guest_xsave) 2996 struct kvm_xsave *guest_xsave)
2986{ 2997{
2987 if (cpu_has_xsave) 2998 if (cpu_has_xsave) {
2988 memcpy(guest_xsave->region, 2999 memcpy(guest_xsave->region,
2989 &vcpu->arch.guest_fpu.state->xsave, 3000 &vcpu->arch.guest_fpu.state->xsave,
2990 xstate_size); 3001 vcpu->arch.guest_xstate_size);
2991 else { 3002 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
3003 vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
3004 } else {
2992 memcpy(guest_xsave->region, 3005 memcpy(guest_xsave->region,
2993 &vcpu->arch.guest_fpu.state->fxsave, 3006 &vcpu->arch.guest_fpu.state->fxsave,
2994 sizeof(struct i387_fxsave_struct)); 3007 sizeof(struct i387_fxsave_struct));
@@ -3003,10 +3016,19 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3003 u64 xstate_bv = 3016 u64 xstate_bv =
3004 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 3017 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3005 3018
3006 if (cpu_has_xsave) 3019 if (cpu_has_xsave) {
3020 /*
3021 * Here we allow setting states that are not present in
3022 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
3023 * with old userspace.
3024 */
3025 if (xstate_bv & ~KVM_SUPPORTED_XCR0)
3026 return -EINVAL;
3027 if (xstate_bv & ~host_xcr0)
3028 return -EINVAL;
3007 memcpy(&vcpu->arch.guest_fpu.state->xsave, 3029 memcpy(&vcpu->arch.guest_fpu.state->xsave,
3008 guest_xsave->region, xstate_size); 3030 guest_xsave->region, vcpu->arch.guest_xstate_size);
3009 else { 3031 } else {
3010 if (xstate_bv & ~XSTATE_FPSSE) 3032 if (xstate_bv & ~XSTATE_FPSSE)
3011 return -EINVAL; 3033 return -EINVAL;
3012 memcpy(&vcpu->arch.guest_fpu.state->fxsave, 3034 memcpy(&vcpu->arch.guest_fpu.state->fxsave,
@@ -3042,9 +3064,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3042 3064
3043 for (i = 0; i < guest_xcrs->nr_xcrs; i++) 3065 for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3044 /* Only support XCR0 currently */ 3066 /* Only support XCR0 currently */
3045 if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { 3067 if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3046 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 3068 r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3047 guest_xcrs->xcrs[0].value); 3069 guest_xcrs->xcrs[i].value);
3048 break; 3070 break;
3049 } 3071 }
3050 if (r) 3072 if (r)
@@ -4775,8 +4797,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
4775 4797
4776static void init_decode_cache(struct x86_emulate_ctxt *ctxt) 4798static void init_decode_cache(struct x86_emulate_ctxt *ctxt)
4777{ 4799{
4778 memset(&ctxt->twobyte, 0, 4800 memset(&ctxt->opcode_len, 0,
4779 (void *)&ctxt->_regs - (void *)&ctxt->twobyte); 4801 (void *)&ctxt->_regs - (void *)&ctxt->opcode_len);
4780 4802
4781 ctxt->fetch.start = 0; 4803 ctxt->fetch.start = 0;
4782 ctxt->fetch.end = 0; 4804 ctxt->fetch.end = 0;
@@ -5094,8 +5116,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5094 ctxt->have_exception = false; 5116 ctxt->have_exception = false;
5095 ctxt->perm_ok = false; 5117 ctxt->perm_ok = false;
5096 5118
5097 ctxt->only_vendor_specific_insn 5119 ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
5098 = emulation_type & EMULTYPE_TRAP_UD;
5099 5120
5100 r = x86_decode_insn(ctxt, insn, insn_len); 5121 r = x86_decode_insn(ctxt, insn, insn_len);
5101 5122
@@ -5263,7 +5284,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
5263 5284
5264 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); 5285 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5265 5286
5266 raw_spin_lock(&kvm_lock); 5287 spin_lock(&kvm_lock);
5267 list_for_each_entry(kvm, &vm_list, vm_list) { 5288 list_for_each_entry(kvm, &vm_list, vm_list) {
5268 kvm_for_each_vcpu(i, vcpu, kvm) { 5289 kvm_for_each_vcpu(i, vcpu, kvm) {
5269 if (vcpu->cpu != freq->cpu) 5290 if (vcpu->cpu != freq->cpu)
@@ -5273,7 +5294,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
5273 send_ipi = 1; 5294 send_ipi = 1;
5274 } 5295 }
5275 } 5296 }
5276 raw_spin_unlock(&kvm_lock); 5297 spin_unlock(&kvm_lock);
5277 5298
5278 if (freq->old < freq->new && send_ipi) { 5299 if (freq->old < freq->new && send_ipi) {
5279 /* 5300 /*
@@ -5426,12 +5447,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work)
5426 struct kvm_vcpu *vcpu; 5447 struct kvm_vcpu *vcpu;
5427 int i; 5448 int i;
5428 5449
5429 raw_spin_lock(&kvm_lock); 5450 spin_lock(&kvm_lock);
5430 list_for_each_entry(kvm, &vm_list, vm_list) 5451 list_for_each_entry(kvm, &vm_list, vm_list)
5431 kvm_for_each_vcpu(i, vcpu, kvm) 5452 kvm_for_each_vcpu(i, vcpu, kvm)
5432 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); 5453 set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
5433 atomic_set(&kvm_guest_has_master_clock, 0); 5454 atomic_set(&kvm_guest_has_master_clock, 0);
5434 raw_spin_unlock(&kvm_lock); 5455 spin_unlock(&kvm_lock);
5435} 5456}
5436 5457
5437static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 5458static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
@@ -5945,10 +5966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5945 5966
5946 vcpu->mode = IN_GUEST_MODE; 5967 vcpu->mode = IN_GUEST_MODE;
5947 5968
5969 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5970
5948 /* We should set ->mode before check ->requests, 5971 /* We should set ->mode before check ->requests,
5949 * see the comment in make_all_cpus_request. 5972 * see the comment in make_all_cpus_request.
5950 */ 5973 */
5951 smp_mb(); 5974 smp_mb__after_srcu_read_unlock();
5952 5975
5953 local_irq_disable(); 5976 local_irq_disable();
5954 5977
@@ -5958,12 +5981,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5958 smp_wmb(); 5981 smp_wmb();
5959 local_irq_enable(); 5982 local_irq_enable();
5960 preempt_enable(); 5983 preempt_enable();
5984 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5961 r = 1; 5985 r = 1;
5962 goto cancel_injection; 5986 goto cancel_injection;
5963 } 5987 }
5964 5988
5965 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5966
5967 if (req_immediate_exit) 5989 if (req_immediate_exit)
5968 smp_send_reschedule(vcpu->cpu); 5990 smp_send_reschedule(vcpu->cpu);
5969 5991
@@ -6688,7 +6710,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
6688 if (r) 6710 if (r)
6689 return r; 6711 return r;
6690 kvm_vcpu_reset(vcpu); 6712 kvm_vcpu_reset(vcpu);
6691 r = kvm_mmu_setup(vcpu); 6713 kvm_mmu_setup(vcpu);
6692 vcpu_put(vcpu); 6714 vcpu_put(vcpu);
6693 6715
6694 return r; 6716 return r;
@@ -6940,6 +6962,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
6940 6962
6941 vcpu->arch.ia32_tsc_adjust_msr = 0x0; 6963 vcpu->arch.ia32_tsc_adjust_msr = 0x0;
6942 vcpu->arch.pv_time_enabled = false; 6964 vcpu->arch.pv_time_enabled = false;
6965
6966 vcpu->arch.guest_supported_xcr0 = 0;
6967 vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
6968
6943 kvm_async_pf_hash_reset(vcpu); 6969 kvm_async_pf_hash_reset(vcpu);
6944 kvm_pmu_init(vcpu); 6970 kvm_pmu_init(vcpu);
6945 6971
@@ -6981,6 +7007,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
6981 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 7007 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
6982 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); 7008 INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
6983 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 7009 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
7010 atomic_set(&kvm->arch.noncoherent_dma_count, 0);
6984 7011
6985 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 7012 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
6986 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 7013 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
@@ -7065,7 +7092,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
7065 kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); 7092 kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
7066} 7093}
7067 7094
7068void kvm_arch_free_memslot(struct kvm_memory_slot *free, 7095void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7069 struct kvm_memory_slot *dont) 7096 struct kvm_memory_slot *dont)
7070{ 7097{
7071 int i; 7098 int i;
@@ -7086,7 +7113,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free,
7086 } 7113 }
7087} 7114}
7088 7115
7089int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 7116int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7117 unsigned long npages)
7090{ 7118{
7091 int i; 7119 int i;
7092 7120
@@ -7283,7 +7311,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
7283 int r; 7311 int r;
7284 7312
7285 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || 7313 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
7286 is_error_page(work->page)) 7314 work->wakeup_all)
7287 return; 7315 return;
7288 7316
7289 r = kvm_mmu_reload(vcpu); 7317 r = kvm_mmu_reload(vcpu);
@@ -7393,7 +7421,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7393 struct x86_exception fault; 7421 struct x86_exception fault;
7394 7422
7395 trace_kvm_async_pf_ready(work->arch.token, work->gva); 7423 trace_kvm_async_pf_ready(work->arch.token, work->gva);
7396 if (is_error_page(work->page)) 7424 if (work->wakeup_all)
7397 work->arch.token = ~0; /* broadcast wakeup */ 7425 work->arch.token = ~0; /* broadcast wakeup */
7398 else 7426 else
7399 kvm_del_async_pf_gfn(vcpu, work->arch.gfn); 7427 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
@@ -7420,6 +7448,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7420 kvm_x86_ops->interrupt_allowed(vcpu); 7448 kvm_x86_ops->interrupt_allowed(vcpu);
7421} 7449}
7422 7450
7451void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
7452{
7453 atomic_inc(&kvm->arch.noncoherent_dma_count);
7454}
7455EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
7456
7457void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
7458{
7459 atomic_dec(&kvm->arch.noncoherent_dma_count);
7460}
7461EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
7462
7463bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
7464{
7465 return atomic_read(&kvm->arch.noncoherent_dma_count);
7466}
7467EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
7468
7423EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 7469EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7424EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 7470EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7425EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 7471EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
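
kvm_arch_register_noncoherent_dma() and its companions replace the direct iommu_domain/KVM_IOMMU_CACHE_COHERENCY checks removed earlier in vmx_get_mt_mask() and need_emulate_wbinvd(): whichever path attaches a device whose DMA is not cache-coherent bumps a per-VM counter, and the memory-type and WBINVD paths only ask whether that counter is non-zero. A tiny model of the pattern using C11 atomics; the attach/detach caller shown is hypothetical (in the kernel it would be the device-assignment code), and the struct is a stand-in for struct kvm:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct vm {
        atomic_int noncoherent_dma_count;
    };

    static void register_noncoherent_dma(struct vm *vm)
    {
        atomic_fetch_add(&vm->noncoherent_dma_count, 1);
    }

    static void unregister_noncoherent_dma(struct vm *vm)
    {
        atomic_fetch_sub(&vm->noncoherent_dma_count, 1);
    }

    static bool has_noncoherent_dma(struct vm *vm)
    {
        return atomic_load(&vm->noncoherent_dma_count) != 0;
    }

    int main(void)
    {
        struct vm vm = { .noncoherent_dma_count = 0 };

        /* Hypothetical device attach/detach bracketing the counter. */
        register_noncoherent_dma(&vm);
        printf("need WBINVD emulation while attached: %d\n",
               has_noncoherent_dma(&vm));       /* 1 */
        unregister_noncoherent_dma(&vm);
        printf("after detach:                         %d\n",
               has_noncoherent_dma(&vm));       /* 0 */
        return 0;
    }
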
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e224f7a671b6..587fb9ede436 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -122,6 +122,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
122 gva_t addr, void *val, unsigned int bytes, 122 gva_t addr, void *val, unsigned int bytes,
123 struct x86_exception *exception); 123 struct x86_exception *exception);
124 124
125#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
125extern u64 host_xcr0; 126extern u64 host_xcr0;
126 127
127extern struct static_key kvm_no_apic_vcpu; 128extern struct static_key kvm_no_apic_vcpu;
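
KVM_SUPPORTED_XCR0 caps the xstate features KVM is prepared to expose at FP, SSE and AVX. Combined with the vcpu->arch.guest_supported_xcr0 field initialized in kvm_arch_vcpu_init() above, a natural derivation (presumably done by the cpuid.c changes listed in the diffstat, which are not reproduced in this excerpt) is the intersection of what the guest's CPUID leaf 0xD advertises, what the host enabled in XCR0, and what KVM supports. A hedged sketch of that intersection only:

    #include <stdint.h>
    #include <stdio.h>

    #define XSTATE_FP   (1ull << 0)
    #define XSTATE_SSE  (1ull << 1)
    #define XSTATE_YMM  (1ull << 2)

    #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)

    /* Sketch: the set a guest may enable is bounded by its own CPUID leaf 0xD,
     * by the host's XCR0, and by what KVM knows how to save and restore. */
    static uint64_t guest_supported_xcr0(uint64_t guest_cpuid_xcr0,
                                         uint64_t host_xcr0)
    {
        return guest_cpuid_xcr0 & host_xcr0 & KVM_SUPPORTED_XCR0;
    }

    int main(void)
    {
        uint64_t host = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
        uint64_t guest_cpuid = XSTATE_FP | XSTATE_SSE;   /* AVX hidden from guest */

        printf("guest_supported_xcr0 = %#llx\n",
               (unsigned long long)guest_supported_xcr0(guest_cpuid, host));
        return 0;
    }
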