author     Linus Torvalds <torvalds@linux-foundation.org>    2013-11-14 23:51:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-11-14 23:51:36 -0500
commit     f080480488028bcc25357f85e8ae54ccc3bb7173 (patch)
tree       8fcc943f16d26c795b3b6324b478af2d5a30285d /arch/x86
parent     eda670c626a4f53eb8ac5f20d8c10d3f0b54c583 (diff)
parent     e504c9098ed6acd9e1079c5e10e4910724ad429f (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM changes from Paolo Bonzini:
"Here are the 3.13 KVM changes. There was a lot of work on the PPC
side: the HV and emulation flavors can now coexist in a single kernel
is probably the most interesting change from a user point of view.
On the x86 side there are nested virtualization improvements and a few
bugfixes.
ARM got transparent huge page support, improved overcommit, and
support for big endian guests.
Finally, there is a new interface to connect KVM with VFIO. This
helps with devices that use NoSnoop PCI transactions, letting the
driver in the guest execute WBINVD instructions. This includes some
NVIDIA cards on Windows, which fail to start without these patches and
the corresponding userspace changes"
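For background, the KVM-VFIO interface mentioned above is exposed to userspace as a KVM device. Below is a minimal sketch of how a VMM might register a VFIO group with it, assuming a 3.13-era <linux/kvm.h>; the helper name is ours and error handling is trimmed — this is not part of the merge's diff:

	/* Hedged sketch: registering a VFIO group with the new KVM-VFIO device
	 * so KVM can track noncoherent (NoSnoop-capable) DMA for the guest. */
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int kvm_attach_vfio_group(int vm_fd, int vfio_group_fd)
	{
		struct kvm_create_device cd = { .type = KVM_DEV_TYPE_VFIO };
		struct kvm_device_attr attr = {
			.group = KVM_DEV_VFIO_GROUP,
			.attr  = KVM_DEV_VFIO_GROUP_ADD,
			.addr  = (__u64)(unsigned long)&vfio_group_fd,
		};

		if (ioctl(vm_fd, KVM_CREATE_DEVICE, &cd) < 0)
			return -1;
		/* cd.fd now refers to the device; adding the group lets KVM
		 * decide whether the guest's WBINVD must be honored. */
		return ioctl(cd.fd, KVM_SET_DEVICE_ATTR, &attr);
	}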
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (146 commits)
kvm, vmx: Fix lazy FPU on nested guest
arm/arm64: KVM: PSCI: propagate caller endianness to the incoming vcpu
arm/arm64: KVM: MMIO support for BE guest
kvm, cpuid: Fix sparse warning
kvm: Delete prototype for non-existent function kvm_check_iopl
kvm: Delete prototype for non-existent function complete_pio
hung_task: add method to reset detector
pvclock: detect watchdog reset at pvclock read
kvm: optimize out smp_mb after srcu_read_unlock
srcu: API for barrier after srcu read unlock
KVM: remove vm mmap method
KVM: IOMMU: hva align mapping page size
KVM: x86: trace cpuid emulation when called from emulator
KVM: emulator: cleanup decode_register_operand() a bit
KVM: emulator: check rex prefix inside decode_register()
KVM: x86: fix emulation of "movzbl %bpl, %eax"
kvm_host: typo fix
KVM: x86: emulate SAHF instruction
MAINTAINERS: add tree for kvm.git
Documentation/kvm: add a 00-INDEX file
...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h       10
-rw-r--r--  arch/x86/include/asm/kvm_host.h          23
-rw-r--r--  arch/x86/include/asm/pvclock.h            2
-rw-r--r--  arch/x86/include/uapi/asm/kvm.h           6
-rw-r--r--  arch/x86/include/uapi/asm/msr-index.h     1
-rw-r--r--  arch/x86/kernel/kvmclock.c                1
-rw-r--r--  arch/x86/kernel/pvclock.c                13
-rw-r--r--  arch/x86/kvm/Kconfig                      1
-rw-r--r--  arch/x86/kvm/Makefile                     2
-rw-r--r--  arch/x86/kvm/cpuid.c                    115
-rw-r--r--  arch/x86/kvm/cpuid.h                      5
-rw-r--r--  arch/x86/kvm/emulate.c                  130
-rw-r--r--  arch/x86/kvm/mmu.c                      115
-rw-r--r--  arch/x86/kvm/mmu.h                        4
-rw-r--r--  arch/x86/kvm/svm.c                        8
-rw-r--r--  arch/x86/kvm/vmx.c                      158
-rw-r--r--  arch/x86/kvm/x86.c                      108
-rw-r--r--  arch/x86/kvm/x86.h                        1
18 files changed, 489 insertions, 214 deletions
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 15f960c06ff7..24ec1216596e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -274,13 +274,17 @@ struct x86_emulate_ctxt {
 
 	bool guest_mode; /* guest running a nested guest */
 	bool perm_ok; /* do not check permissions if true */
-	bool only_vendor_specific_insn;
+	bool ud;	/* inject an #UD if host doesn't support insn */
 
 	bool have_exception;
 	struct x86_exception exception;
 
-	/* decode cache */
-	u8 twobyte;
+	/*
+	 * decode cache
+	 */
+
+	/* current opcode length in bytes */
+	u8 opcode_len;
 	u8 b;
 	u8 intercept;
 	u8 lock_prefix;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c76ff74a98f2..ae5d7830855c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,6 +79,13 @@
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 
+static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
+{
+	/* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */
+	return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+}
+
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
 
@@ -253,7 +260,6 @@ struct kvm_pio_request {
  * mode.
  */
 struct kvm_mmu {
-	void (*new_cr3)(struct kvm_vcpu *vcpu);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
 	unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
 	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
@@ -261,7 +267,6 @@ struct kvm_mmu {
 			  bool prefault);
 	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
 				  struct x86_exception *fault);
-	void (*free)(struct kvm_vcpu *vcpu);
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    struct x86_exception *exception);
 	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
@@ -389,6 +394,8 @@ struct kvm_vcpu_arch {
 
 	struct fpu guest_fpu;
 	u64 xcr0;
+	u64 guest_supported_xcr0;
+	u32 guest_xstate_size;
 
 	struct kvm_pio_request pio;
 	void *pio_data;
@@ -557,7 +564,9 @@ struct kvm_arch {
 
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
-	int iommu_flags;
+	bool iommu_noncoherent;
+#define __KVM_HAVE_ARCH_NONCOHERENT_DMA
+	atomic_t noncoherent_dma_count;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
@@ -780,11 +789,11 @@ void kvm_mmu_module_exit(void);
 
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
-int kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
 void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 				     struct kvm_memory_slot *slot,
@@ -922,13 +931,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
 
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int complete_pio(struct kvm_vcpu *vcpu);
-bool kvm_check_iopl(struct kvm_vcpu *vcpu);
-
 static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
 {
 	return gpa;
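The gfn_to_index() helper added above maps a guest frame number to its slot-relative index at a given hugepage level. A standalone mirror with the shift macro expanded (on x86, KVM_HPAGE_GFN_SHIFT(level) works out to 9 * (level - 1)), plus one worked case:

	/* Standalone mirror of gfn_to_index(); gfn_t is u64 in the kernel. */
	#include <stdio.h>
	#include <stdint.h>

	typedef uint64_t gfn_t;

	#define KVM_HPAGE_GFN_SHIFT(level) (((level) - 1) * 9)	/* PT_PAGE_TABLE_LEVEL == 1 */

	static gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
	{
		return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
		       (base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
	}

	int main(void)
	{
		/* Level 2 = 2 MiB pages (512 gfns each): gfn 0x12345 lies in the
		 * second 2 MiB region of a slot starting at gfn 0x12000. */
		printf("%llu\n", (unsigned long long)gfn_to_index(0x12345, 0x12000, 2)); /* 1 */
		return 0;
	}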
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index be8269b00e2a..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -14,6 +14,8 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct timespec *ts);
 void pvclock_resume(void);
 
+void pvclock_touch_watchdogs(void);
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5d9a3033b3d7..d3a87780c70b 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -211,9 +211,9 @@ struct kvm_cpuid_entry2 {
 	__u32 padding[3];
 };
 
-#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
-#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
-#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX	BIT(0)
+#define KVM_CPUID_FLAG_STATEFUL_FUNC	BIT(1)
+#define KVM_CPUID_FLAG_STATE_READ_NEXT	BIT(2)
 
 /* for KVM_SET_CPUID2 */
 struct kvm_cpuid2 {
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index bb0465090ae5..b93e09a0fa21 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -536,6 +536,7 @@
 
 /* MSR_IA32_VMX_MISC bits */
 #define MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS (1ULL << 29)
+#define MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE   0x1F
 /* AMD-V MSRs */
 
 #define MSR_VM_CR                       0xc0010114
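The new MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE mask selects bits 4:0 of IA32_VMX_MISC; per the Intel SDM, the VMX preemption timer counts down once every 2^X TSC cycles, where X is that field. A hedged host-side sketch of deriving the timer rate (the function name is illustrative):

	/* Sketch only: translating the TSC rate into the VMX preemption
	 * timer rate using the new mask; kernel context assumed. */
	#include <asm/msr.h>

	static u64 vmx_preemption_timer_khz(u64 tsc_khz_val)
	{
		u64 misc;
		int scale;

		rdmsrl(MSR_IA32_VMX_MISC, misc);
		scale = misc & MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE;

		return tsc_khz_val >> scale;	/* timer ticks every 2^scale TSC cycles */
	}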
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1570e0741344..e6041094ff26 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -139,6 +139,7 @@ bool kvm_check_and_clear_guest_paused(void)
 	src = &hv_clock[cpu].pvti;
 	if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
+		pvclock_touch_watchdogs();
 		ret = true;
 	}
 
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index a16bae3f83b3..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -43,6 +43,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 	return pv_tsc_khz;
 }
 
+void pvclock_touch_watchdogs(void)
+{
+	touch_softlockup_watchdog_sync();
+	clocksource_touch_watchdog();
+	rcu_cpu_stall_reset();
+	reset_hung_task_detector();
+}
+
 static atomic64_t last_value = ATOMIC64_INIT(0);
 
 void pvclock_resume(void)
@@ -74,6 +82,11 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		version = __pvclock_read_cycles(src, &ret, &flags);
 	} while ((src->version & 1) || version != src->version);
 
+	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
+		src->flags &= ~PVCLOCK_GUEST_STOPPED;
+		pvclock_touch_watchdogs();
+	}
+
 	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
 		(flags & PVCLOCK_TSC_STABLE_BIT))
 		return ret;
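For context on the retry loop above: the host publishes pvclock_vcpu_time_info under a version field that is odd while an update is in flight, and the new PVCLOCK_GUEST_STOPPED handling piggybacks on a successful read. A simplified standalone sketch of that read protocol, with the field set reduced and barriers approximated by GCC builtins:

	/* Minimal sketch of the pvclock read protocol; 'pvti' is assumed
	 * mapped read-only from the host. Field names follow
	 * pvclock_vcpu_time_info, but this is not the real layout. */
	#include <stdint.h>

	struct pvti {
		volatile uint32_t version;
		uint64_t system_time;
		uint8_t  flags;
	};

	static uint64_t pvclock_read(struct pvti *p, uint8_t *flags)
	{
		uint32_t version;
		uint64_t time;

		do {
			version = p->version;	/* odd => update in progress */
			__sync_synchronize();	/* read barrier */
			time = p->system_time;	/* + scaled TSC delta in the real code */
			*flags = p->flags;
			__sync_synchronize();
		} while ((p->version & 1) || version != p->version);

		return time;
	}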
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a47a3e54b964..b89c5db2b832 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -38,6 +38,7 @@ config KVM
 	select PERF_EVENTS
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
+	select KVM_VFIO
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index bf4fb04d0112..25d22b2d6509 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,7 @@ KVM := ../../../virt/kvm
 
 kvm-y			+= $(KVM)/kvm_main.o $(KVM)/ioapic.o \
 				$(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
-				$(KVM)/eventfd.o $(KVM)/irqchip.o
+				$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= $(KVM)/assigned-dev.o $(KVM)/iommu.o
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index b110fe6c03d4..c6976257eff5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -23,6 +23,26 @@
 #include "mmu.h"
 #include "trace.h"
 
+static u32 xstate_required_size(u64 xstate_bv)
+{
+	int feature_bit = 0;
+	u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+	xstate_bv &= ~XSTATE_FPSSE;
+	while (xstate_bv) {
+		if (xstate_bv & 0x1) {
+			u32 eax, ebx, ecx, edx;
+			cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
+			ret = max(ret, eax + ebx);
+		}
+
+		xstate_bv >>= 1;
+		feature_bit++;
+	}
+
+	return ret;
+}
+
 void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
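xstate_required_size() above relies on CPUID leaf 0xD: for each enabled state component i, subleaf i returns the component's size in EAX and its offset in EBX, so the XSAVE buffer must extend to the largest offset + size. A hedged userspace equivalent using GCC's <cpuid.h> (constants match the kernel's values):

	#include <stdint.h>
	#include <cpuid.h>

	#define XSTATE_FPSSE     0x3	/* x87 + SSE live in the legacy area */
	#define XSAVE_HDR_SIZE   64
	#define XSAVE_HDR_OFFSET 512

	static uint32_t xstate_required_size(uint64_t xstate_bv)
	{
		uint32_t eax, ebx, ecx, edx;
		uint32_t ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
		int bit = 0;

		xstate_bv &= ~(uint64_t)XSTATE_FPSSE;
		for (; xstate_bv; xstate_bv >>= 1, bit++) {
			if (!(xstate_bv & 1))
				continue;
			__cpuid_count(0xD, bit, eax, ebx, ecx, edx);
			if (eax + ebx > ret)	/* offset + size of this component */
				ret = eax + ebx;
		}
		return ret;
	}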
@@ -46,6 +66,18 @@ void kvm_update_cpuid(struct kvm_vcpu *vcpu)
 		apic->lapic_timer.timer_mode_mask = 1 << 17;
 	}
 
+	best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+	if (!best) {
+		vcpu->arch.guest_supported_xcr0 = 0;
+		vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+	} else {
+		vcpu->arch.guest_supported_xcr0 =
+			(best->eax | ((u64)best->edx << 32)) &
+			host_xcr0 & KVM_SUPPORTED_XCR0;
+		vcpu->arch.guest_xstate_size =
+			xstate_required_size(vcpu->arch.guest_supported_xcr0);
+	}
+
 	kvm_pmu_cpuid_update(vcpu);
 }
 
@@ -182,13 +214,35 @@ static bool supported_xcr0_bit(unsigned bit)
 {
 	u64 mask = ((u64)1 << bit);
 
-	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+	return mask & KVM_SUPPORTED_XCR0 & host_xcr0;
 }
 
 #define F(x) bit(X86_FEATURE_##x)
 
-static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
-			u32 index, int *nent, int maxnent)
+static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
+				   u32 func, u32 index, int *nent, int maxnent)
+{
+	switch (func) {
+	case 0:
+		entry->eax = 1;		/* only one leaf currently */
+		++*nent;
+		break;
+	case 1:
+		entry->ecx = F(MOVBE);
+		++*nent;
+		break;
+	default:
+		break;
+	}
+
+	entry->function = func;
+	entry->index = index;
+
+	return 0;
+}
+
+static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+				 u32 index, int *nent, int maxnent)
 {
 	int r;
 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
@@ -383,6 +437,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case 0xd: {
 		int idx, i;
 
+		entry->eax &= host_xcr0 & KVM_SUPPORTED_XCR0;
+		entry->edx &= (host_xcr0 & KVM_SUPPORTED_XCR0) >> 32;
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		for (idx = 1, i = 1; idx < 64; ++idx) {
 			if (*nent >= maxnent)
@@ -481,6 +537,15 @@ out:
 	return r;
 }
 
+static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 func,
+			u32 idx, int *nent, int maxnent, unsigned int type)
+{
+	if (type == KVM_GET_EMULATED_CPUID)
+		return __do_cpuid_ent_emulated(entry, func, idx, nent, maxnent);
+
+	return __do_cpuid_ent(entry, func, idx, nent, maxnent);
+}
+
 #undef F
 
 struct kvm_cpuid_param {
@@ -495,8 +560,36 @@ static bool is_centaur_cpu(const struct kvm_cpuid_param *param)
 	return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR;
 }
 
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				      struct kvm_cpuid_entry2 __user *entries)
+static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries,
+				 __u32 num_entries, unsigned int ioctl_type)
+{
+	int i;
+	__u32 pad[3];
+
+	if (ioctl_type != KVM_GET_EMULATED_CPUID)
+		return false;
+
+	/*
+	 * We want to make sure that ->padding is being passed clean from
+	 * userspace in case we want to use it for something in the future.
+	 *
+	 * Sadly, this wasn't enforced for KVM_GET_SUPPORTED_CPUID and so we
+	 * have to give ourselves satisfied only with the emulated side. /me
+	 * sheds a tear.
+	 */
+	for (i = 0; i < num_entries; i++) {
+		if (copy_from_user(pad, entries[i].padding, sizeof(pad)))
+			return true;
+
+		if (pad[0] || pad[1] || pad[2])
+			return true;
+	}
+	return false;
+}
+
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+			    struct kvm_cpuid_entry2 __user *entries,
+			    unsigned int type)
 {
 	struct kvm_cpuid_entry2 *cpuid_entries;
 	int limit, nent = 0, r = -E2BIG, i;
@@ -513,8 +606,12 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 		goto out;
 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
 		cpuid->nent = KVM_MAX_CPUID_ENTRIES;
+
+	if (sanity_check_entries(entries, cpuid->nent, type))
+		return -EINVAL;
+
 	r = -ENOMEM;
-	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+	cpuid_entries = vzalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
 	if (!cpuid_entries)
 		goto out;
 
@@ -526,7 +623,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 			continue;
 
 		r = do_cpuid_ent(&cpuid_entries[nent], ent->func, ent->idx,
-				&nent, cpuid->nent);
+				&nent, cpuid->nent, type);
 
 		if (r)
 			goto out_free;
@@ -537,7 +634,7 @@ int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 		limit = cpuid_entries[nent - 1].eax;
 		for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func)
 			r = do_cpuid_ent(&cpuid_entries[nent], func, ent->idx,
-				     &nent, cpuid->nent);
+				     &nent, cpuid->nent, type);
 
 		if (r)
 			goto out_free;
@@ -661,6 +758,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
 		*edx = best->edx;
 	} else
 		*eax = *ebx = *ecx = *edx = 0;
+	trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx);
 }
 EXPORT_SYMBOL_GPL(kvm_cpuid);
 
@@ -676,6 +774,5 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 	kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
-	trace_kvm_cpuid(function, eax, ebx, ecx, edx);
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
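The renamed kvm_dev_ioctl_get_cpuid() serves both KVM_GET_SUPPORTED_CPUID and the new KVM_GET_EMULATED_CPUID, which lists features KVM can provide by software emulation (MOVBE being the initial user). A minimal userspace sketch, assuming a 3.13-era <linux/kvm.h>:

	/* Hedged sketch: querying the emulated-CPUID list on /dev/kvm.
	 * calloc() keeps ->padding zeroed, as sanity_check_entries() demands. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		unsigned nent = 64;
		struct kvm_cpuid2 *c = calloc(1, sizeof(*c) + nent * sizeof(c->entries[0]));

		c->nent = nent;
		if (kvm >= 0 && ioctl(kvm, KVM_GET_EMULATED_CPUID, c) == 0) {
			for (unsigned i = 0; i < c->nent; i++)
				printf("func %#x idx %#x: ecx=%#x\n",
				       c->entries[i].function, c->entries[i].index,
				       c->entries[i].ecx);
		}
		free(c);
		return 0;
	}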
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index b7fd07984888..f1e4895174b2 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -6,8 +6,9 @@
 void kvm_update_cpuid(struct kvm_vcpu *vcpu);
 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 					      u32 function, u32 index);
-int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
-				      struct kvm_cpuid_entry2 __user *entries);
+int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid,
+			    struct kvm_cpuid_entry2 __user *entries,
+			    unsigned int type);
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 			     struct kvm_cpuid *cpuid,
 			     struct kvm_cpuid_entry __user *entries);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ddc3f3d2afdb..07ffca0a89e9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -130,7 +130,7 @@
 #define Mov         (1<<20)
 /* Misc flags */
 #define Prot        (1<<21) /* instruction generates #UD if not in prot-mode */
-#define VendorSpecific (1<<22) /* Vendor specific instruction */
+#define EmulateOnUD (1<<22) /* Emulate if unsupported by the host */
 #define NoAccess    (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
 #define Op3264      (1<<24) /* Operand is 64b in long mode, 32b otherwise */
 #define Undefined   (1<<25) /* No Such Instruction */
@@ -785,9 +785,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
  * @highbyte_regs specifies whether to decode AH,CH,DH,BH.
  */
 static void *decode_register(struct x86_emulate_ctxt *ctxt, u8 modrm_reg,
-			     int highbyte_regs)
+			     int byteop)
 {
 	void *p;
+	int highbyte_regs = (ctxt->rex_prefix == 0) && byteop;
 
 	if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8)
 		p = (unsigned char *)reg_rmw(ctxt, modrm_reg & 3) + 1;
@@ -1024,7 +1025,6 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 			   struct operand *op)
 {
 	unsigned reg = ctxt->modrm_reg;
-	int highbyte_regs = ctxt->rex_prefix == 0;
 
 	if (!(ctxt->d & ModRM))
 		reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
@@ -1045,13 +1045,9 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if (ctxt->d & ByteOp) {
-		op->addr.reg = decode_register(ctxt, reg, highbyte_regs);
-		op->bytes = 1;
-	} else {
-		op->addr.reg = decode_register(ctxt, reg, 0);
-		op->bytes = ctxt->op_bytes;
-	}
+	op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+	op->addr.reg = decode_register(ctxt, reg, ctxt->d & ByteOp);
+
 	fetch_register_operand(op);
 	op->orig_val = op->val;
 }
@@ -1082,12 +1078,10 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 		ctxt->modrm_seg = VCPU_SREG_DS;
 
 	if (ctxt->modrm_mod == 3) {
-		int highbyte_regs = ctxt->rex_prefix == 0;
-
 		op->type = OP_REG;
 		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
 		op->addr.reg = decode_register(ctxt, ctxt->modrm_rm,
-					       highbyte_regs && (ctxt->d & ByteOp));
+					       ctxt->d & ByteOp);
 		if (ctxt->d & Sse) {
 			op->type = OP_XMM;
 			op->bytes = 16;
@@ -2961,6 +2955,46 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+#define FFL(x) bit(X86_FEATURE_##x)
+
+static int em_movbe(struct x86_emulate_ctxt *ctxt)
+{
+	u32 ebx, ecx, edx, eax = 1;
+	u16 tmp;
+
+	/*
+	 * Check MOVBE is set in the guest-visible CPUID leaf.
+	 */
+	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & FFL(MOVBE)))
+		return emulate_ud(ctxt);
+
+	switch (ctxt->op_bytes) {
+	case 2:
+		/*
+		 * From MOVBE definition: "...When the operand size is 16 bits,
+		 * the upper word of the destination register remains unchanged
+		 * ..."
+		 *
+		 * Both casting ->valptr and ->val to u16 breaks strict aliasing
+		 * rules so we have to do the operation almost per hand.
+		 */
+		tmp = (u16)ctxt->src.val;
+		ctxt->dst.val &= ~0xffffUL;
+		ctxt->dst.val |= (unsigned long)swab16(tmp);
+		break;
+	case 4:
+		ctxt->dst.val = swab32((u32)ctxt->src.val);
+		break;
+	case 8:
+		ctxt->dst.val = swab64(ctxt->src.val);
+		break;
+	default:
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static int em_cr_write(struct x86_emulate_ctxt *ctxt)
 {
 	if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
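A quick standalone check of the 16-bit case handled by em_movbe() above: MOVBE byte-swaps the source but must leave the destination's upper 48 bits untouched. swab16 is reimplemented here since the kernel helper isn't available in userspace:

	#include <assert.h>
	#include <stdint.h>

	static uint16_t swab16(uint16_t x) { return (uint16_t)(x << 8 | x >> 8); }

	int main(void)
	{
		uint64_t dst = 0xdeadbeefcafe1122ULL;
		uint16_t src = 0x3456;

		/* MOVBE r16, m16: only the low word of dst changes, byte-swapped. */
		dst = (dst & ~0xffffULL) | swab16(src);
		assert(dst == 0xdeadbeefcafe5634ULL);
		return 0;
	}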
@@ -3256,6 +3290,18 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
+static int em_sahf(struct x86_emulate_ctxt *ctxt)
+{
+	u32 flags;
+
+	flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
+	flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
+
+	ctxt->eflags &= ~0xffUL;
+	ctxt->eflags |= flags | X86_EFLAGS_FIXED;
+	return X86EMUL_CONTINUE;
+}
+
 static int em_lahf(struct x86_emulate_ctxt *ctxt)
 {
 	*reg_rmw(ctxt, VCPU_REGS_RAX) &= ~0xff00UL;
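And a worked check of the SAHF semantics em_sahf() implements: SF/ZF/AF/PF/CF are loaded from AH, the rest of the low EFLAGS byte is cleared, and the always-one bit 1 (X86_EFLAGS_FIXED) is forced:

	#include <assert.h>
	#include <stdint.h>

	#define EFLG_CF 0x001
	#define EFLG_PF 0x004
	#define EFLG_AF 0x010
	#define EFLG_ZF 0x040
	#define EFLG_SF 0x080
	#define X86_EFLAGS_FIXED 0x002	/* always-one reserved bit */

	int main(void)
	{
		uint64_t rax = 0xffULL << 8;	/* AH = 0xff: every bit set */
		uint64_t eflags = 0x246;

		/* Only the five arithmetic flags survive the mask (0xd5). */
		uint64_t flags = (EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF)
				 & (rax >> 8);
		eflags = (eflags & ~0xffULL) | flags | X86_EFLAGS_FIXED;

		assert(eflags == 0x2d7);
		return 0;
	}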
@@ -3502,7 +3548,7 @@ static const struct opcode group7_rm1[] = {
 
 static const struct opcode group7_rm3[] = {
 	DIP(SrcNone | Prot | Priv,		vmrun,		check_svme_pa),
-	II(SrcNone  | Prot | VendorSpecific,	em_vmmcall,	vmmcall),
+	II(SrcNone  | Prot | EmulateOnUD,	em_vmmcall,	vmmcall),
 	DIP(SrcNone | Prot | Priv,		vmload,		check_svme_pa),
 	DIP(SrcNone | Prot | Priv,		vmsave,		check_svme_pa),
 	DIP(SrcNone | Prot | Priv,		stgi,		check_svme),
@@ -3587,7 +3633,7 @@ static const struct group_dual group7 = { {
 	II(SrcMem16 | Mov | Priv,		em_lmsw, lmsw),
 	II(SrcMem | ByteOp | Priv | NoAccess,	em_invlpg, invlpg),
 }, {
-	I(SrcNone | Priv | VendorSpecific,	em_vmcall),
+	I(SrcNone | Priv | EmulateOnUD,		em_vmcall),
 	EXT(0, group7_rm1),
 	N, EXT(0, group7_rm3),
 	II(SrcNone | DstMem | Mov,		em_smsw, smsw), N,
@@ -3750,7 +3796,8 @@ static const struct opcode opcode_table[256] = {
 	D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
 	I(SrcImmFAddr | No64, em_call_far), N,
 	II(ImplicitOps | Stack, em_pushf, pushf),
-	II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf),
+	II(ImplicitOps | Stack, em_popf, popf),
+	I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf),
 	/* 0xA0 - 0xA7 */
 	I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
 	I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
@@ -3810,7 +3857,7 @@ static const struct opcode opcode_table[256] = {
 static const struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	G(0, group6), GD(0, &group7), N, N,
-	N, I(ImplicitOps | VendorSpecific, em_syscall),
+	N, I(ImplicitOps | EmulateOnUD, em_syscall),
 	II(ImplicitOps | Priv, em_clts, clts), N,
 	DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
@@ -3830,8 +3877,8 @@ static const struct opcode twobyte_table[256] = {
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	II(ImplicitOps | Priv, em_rdmsr, rdmsr),
 	IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc),
-	I(ImplicitOps | VendorSpecific, em_sysenter),
-	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
+	I(ImplicitOps | EmulateOnUD, em_sysenter),
+	I(ImplicitOps | Priv | EmulateOnUD, em_sysexit),
 	N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
@@ -3892,6 +3939,30 @@ static const struct opcode twobyte_table[256] = {
 	N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
 };
 
+static const struct gprefix three_byte_0f_38_f0 = {
+	I(DstReg | SrcMem | Mov, em_movbe), N, N, N
+};
+
+static const struct gprefix three_byte_0f_38_f1 = {
+	I(DstMem | SrcReg | Mov, em_movbe), N, N, N
+};
+
+/*
+ * Insns below are selected by the prefix which indexed by the third opcode
+ * byte.
+ */
+static const struct opcode opcode_map_0f_38[256] = {
+	/* 0x00 - 0x7f */
+	X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+	/* 0x80 - 0xef */
+	X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
+	/* 0xf0 - 0xf1 */
+	GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
+	GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
+	/* 0xf2 - 0xff */
+	N, N, X4(N), X8(N)
+};
+
 #undef D
 #undef N
 #undef G
@@ -4040,7 +4111,8 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
 	case OpMem8:
 		ctxt->memop.bytes = 1;
 		if (ctxt->memop.type == OP_REG) {
-			ctxt->memop.addr.reg = decode_register(ctxt, ctxt->modrm_rm, 1);
+			ctxt->memop.addr.reg = decode_register(ctxt,
+					ctxt->modrm_rm, true);
 			fetch_register_operand(&ctxt->memop);
 		}
 		goto mem_common;
@@ -4126,6 +4198,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	ctxt->_eip = ctxt->eip;
 	ctxt->fetch.start = ctxt->_eip;
 	ctxt->fetch.end = ctxt->fetch.start + insn_len;
+	ctxt->opcode_len = 1;
 	if (insn_len > 0)
 		memcpy(ctxt->fetch.data, insn, insn_len);
 
@@ -4208,9 +4281,16 @@ done_prefixes:
 	opcode = opcode_table[ctxt->b];
 	/* Two-byte opcode? */
 	if (ctxt->b == 0x0f) {
-		ctxt->twobyte = 1;
+		ctxt->opcode_len = 2;
 		ctxt->b = insn_fetch(u8, ctxt);
 		opcode = twobyte_table[ctxt->b];
+
+		/* 0F_38 opcode map */
+		if (ctxt->b == 0x38) {
+			ctxt->opcode_len = 3;
+			ctxt->b = insn_fetch(u8, ctxt);
+			opcode = opcode_map_0f_38[ctxt->b];
+		}
 	}
 	ctxt->d = opcode.flags;
 
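The decode path above generalizes the old twobyte flag into opcode_len so a third table can hang off the 0F 38 escape. A miniature of that cascade in isolation — table names and the fetch stand-in are hypothetical:

	/* Miniature of the table-driven decode; '*insn++' stands in for
	 * insn_fetch() and the arrays for the real opcode tables. */
	#include <stdint.h>

	struct opcode { unsigned flags; };

	static unsigned decode_opcode_len(const uint8_t *insn,
					  const struct opcode *one,
					  const struct opcode *two,
					  const struct opcode *three_0f_38,
					  const struct opcode **out)
	{
		unsigned len = 1;
		uint8_t b = *insn++;

		*out = &one[b];
		if (b == 0x0f) {		/* two-byte escape */
			len = 2;
			b = *insn++;
			*out = &two[b];
			if (b == 0x38) {	/* 0F 38: three-byte map */
				len = 3;
				b = *insn++;
				*out = &three_0f_38[b];
			}
		}
		return len;
	}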
@@ -4267,7 +4347,7 @@ done_prefixes:
 	if (ctxt->d == 0 || (ctxt->d & NotImpl))
 		return EMULATION_FAILED;
 
-	if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+	if (!(ctxt->d & EmulateOnUD) && ctxt->ud)
 		return EMULATION_FAILED;
 
 	if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
@@ -4540,8 +4620,10 @@ special_insn:
 		goto writeback;
 	}
 
-	if (ctxt->twobyte)
+	if (ctxt->opcode_len == 2)
 		goto twobyte_insn;
+	else if (ctxt->opcode_len == 3)
+		goto threebyte_insn;
 
 	switch (ctxt->b) {
 	case 0x63:		/* movsxd */
@@ -4726,6 +4808,8 @@ twobyte_insn:
 		goto cannot_emulate;
 	}
 
+threebyte_insn:
+
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dce0df8150df..40772ef0f2b1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2570,11 +2570,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	kvm_release_pfn_clean(pfn);
 }
 
-static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
-{
-	mmu_free_roots(vcpu);
-}
-
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 				     bool no_dirty_log)
 {
@@ -3424,18 +3419,11 @@ out_unlock:
 	return 0;
 }
 
-static void nonpaging_free(struct kvm_vcpu *vcpu)
-{
-	mmu_free_roots(vcpu);
-}
-
-static int nonpaging_init_context(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context)
+static void nonpaging_init_context(struct kvm_vcpu *vcpu,
+				   struct kvm_mmu *context)
 {
-	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
-	context->free = nonpaging_free;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -3444,7 +3432,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->nx = false;
-	return 0;
 }
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -3454,9 +3441,8 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_flush_tlb);
 
-static void paging_new_cr3(struct kvm_vcpu *vcpu)
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
 {
-	pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
 	mmu_free_roots(vcpu);
 }
 
@@ -3471,11 +3457,6 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
 	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
-static void paging_free(struct kvm_vcpu *vcpu)
-{
-	nonpaging_free(vcpu);
-}
-
 static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 			   unsigned access, int *nr_present)
 {
@@ -3665,9 +3646,9 @@ static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
 	mmu->last_pte_bitmap = map;
 }
 
-static int paging64_init_context_common(struct kvm_vcpu *vcpu,
-					struct kvm_mmu *context,
-					int level)
+static void paging64_init_context_common(struct kvm_vcpu *vcpu,
+					 struct kvm_mmu *context,
+					 int level)
 {
 	context->nx = is_nx(vcpu);
 	context->root_level = level;
@@ -3677,27 +3658,24 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 	update_last_pte_bitmap(vcpu, context);
 
 	ASSERT(is_pae(vcpu));
-	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
-	context->free = paging_free;
 	context->shadow_root_level = level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
-	return 0;
 }
 
-static int paging64_init_context(struct kvm_vcpu *vcpu,
-				 struct kvm_mmu *context)
+static void paging64_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
-	return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
+	paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
 }
 
-static int paging32_init_context(struct kvm_vcpu *vcpu,
-				 struct kvm_mmu *context)
+static void paging32_init_context(struct kvm_vcpu *vcpu,
+				  struct kvm_mmu *context)
 {
 	context->nx = false;
 	context->root_level = PT32_ROOT_LEVEL;
@@ -3706,33 +3684,28 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
 
-	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
-	context->free = paging_free;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
-	return 0;
 }
 
-static int paging32E_init_context(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu *context)
+static void paging32E_init_context(struct kvm_vcpu *vcpu,
+				   struct kvm_mmu *context)
 {
-	return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
+	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
-static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = vcpu->arch.walk_mmu;
 
 	context->base_role.word = 0;
-	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
-	context->free = nonpaging_free;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -3767,37 +3740,32 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
 	update_permission_bitmask(vcpu, context, false);
 	update_last_pte_bitmap(vcpu, context);
-
-	return 0;
 }
 
-int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
-	int r;
 	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 	if (!is_paging(vcpu))
-		r = nonpaging_init_context(vcpu, context);
+		nonpaging_init_context(vcpu, context);
 	else if (is_long_mode(vcpu))
-		r = paging64_init_context(vcpu, context);
+		paging64_init_context(vcpu, context);
 	else if (is_pae(vcpu))
-		r = paging32E_init_context(vcpu, context);
+		paging32E_init_context(vcpu, context);
 	else
-		r = paging32_init_context(vcpu, context);
+		paging32_init_context(vcpu, context);
 
 	vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
 	vcpu->arch.mmu.base_role.smep_andnot_wp
 		= smep && !is_write_protection(vcpu);
-
-	return r;
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
 		bool execonly)
 {
 	ASSERT(vcpu);
@@ -3806,37 +3774,30 @@ int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
 	context->nx = true;
-	context->new_cr3 = paging_new_cr3;
 	context->page_fault = ept_page_fault;
 	context->gva_to_gpa = ept_gva_to_gpa;
 	context->sync_page = ept_sync_page;
 	context->invlpg = ept_invlpg;
 	context->update_pte = ept_update_pte;
-	context->free = paging_free;
 	context->root_level = context->shadow_root_level;
 	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 
 	update_permission_bitmask(vcpu, context, true);
 	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
-
-	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
-static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
 {
-	int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
-
+	kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
 	vcpu->arch.walk_mmu->set_cr3           = kvm_x86_ops->set_cr3;
 	vcpu->arch.walk_mmu->get_cr3           = get_cr3;
 	vcpu->arch.walk_mmu->get_pdptr         = kvm_pdptr_read;
 	vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
-
-	return r;
 }
 
-static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
 
@@ -3873,11 +3834,9 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 
 	update_permission_bitmask(vcpu, g_context, false);
 	update_last_pte_bitmap(vcpu, g_context);
-
-	return 0;
 }
 
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	if (mmu_is_nested(vcpu))
 		return init_kvm_nested_mmu(vcpu);
@@ -3887,18 +3846,12 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 		return init_kvm_softmmu(vcpu);
 }
 
-static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
-		/* mmu.free() should set root_hpa = INVALID_PAGE */
-		vcpu->arch.mmu.free(vcpu);
-}
 
-int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
-{
-	destroy_kvm_mmu(vcpu);
-	return init_kvm_mmu(vcpu);
+	kvm_mmu_unload(vcpu);
+	init_kvm_mmu(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
@@ -3923,6 +3876,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
 	mmu_free_roots(vcpu);
+	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
@@ -4281,12 +4235,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	return alloc_mmu_pages(vcpu);
 }
 
-int kvm_mmu_setup(struct kvm_vcpu *vcpu)
+void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-	return init_kvm_mmu(vcpu);
+	init_kvm_mmu(vcpu);
 }
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
@@ -4428,7 +4382,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) | |||
4428 | int nr_to_scan = sc->nr_to_scan; | 4382 | int nr_to_scan = sc->nr_to_scan; |
4429 | unsigned long freed = 0; | 4383 | unsigned long freed = 0; |
4430 | 4384 | ||
4431 | raw_spin_lock(&kvm_lock); | 4385 | spin_lock(&kvm_lock); |
4432 | 4386 | ||
4433 | list_for_each_entry(kvm, &vm_list, vm_list) { | 4387 | list_for_each_entry(kvm, &vm_list, vm_list) { |
4434 | int idx; | 4388 | int idx; |
@@ -4478,9 +4432,8 @@ unlock: | |||
4478 | break; | 4432 | break; |
4479 | } | 4433 | } |
4480 | 4434 | ||
4481 | raw_spin_unlock(&kvm_lock); | 4435 | spin_unlock(&kvm_lock); |
4482 | return freed; | 4436 | return freed; |
4483 | |||
4484 | } | 4437 | } |
4485 | 4438 | ||
4486 | static unsigned long | 4439 | static unsigned long |
@@ -4574,7 +4527,7 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) | |||
4574 | { | 4527 | { |
4575 | ASSERT(vcpu); | 4528 | ASSERT(vcpu); |
4576 | 4529 | ||
4577 | destroy_kvm_mmu(vcpu); | 4530 | kvm_mmu_unload(vcpu); |
4578 | free_mmu_pages(vcpu); | 4531 | free_mmu_pages(vcpu); |
4579 | mmu_free_memory_caches(vcpu); | 4532 | mmu_free_memory_caches(vcpu); |
4580 | } | 4533 | } |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 77e044a0f5f7..292615274358 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h | |||
@@ -70,8 +70,8 @@ enum { | |||
70 | }; | 70 | }; |
71 | 71 | ||
72 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); | 72 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); |
73 | int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); | 73 | void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); |
74 | int kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, | 74 | void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context, |
75 | bool execonly); | 75 | bool execonly); |
76 | 76 | ||
77 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) | 77 | static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) |
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c0bc80391e40..c7168a5cff1b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c | |||
@@ -1959,11 +1959,9 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, | |||
1959 | nested_svm_vmexit(svm); | 1959 | nested_svm_vmexit(svm); |
1960 | } | 1960 | } |
1961 | 1961 | ||
1962 | static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) | 1962 | static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) |
1963 | { | 1963 | { |
1964 | int r; | 1964 | kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); |
1965 | |||
1966 | r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); | ||
1967 | 1965 | ||
1968 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; | 1966 | vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; |
1969 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; | 1967 | vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; |
@@ -1971,8 +1969,6 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) | |||
1971 | vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; | 1969 | vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; |
1972 | vcpu->arch.mmu.shadow_root_level = get_npt_level(); | 1970 | vcpu->arch.mmu.shadow_root_level = get_npt_level(); |
1973 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | 1971 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; |
1974 | |||
1975 | return r; | ||
1976 | } | 1972 | } |
1977 | 1973 | ||
1978 | static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) | 1974 | static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b2fce1b2009..b2fe1c252f35 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -1498,7 +1498,7 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, | |||
1498 | break; | 1498 | break; |
1499 | 1499 | ||
1500 | if (i == NR_AUTOLOAD_MSRS) { | 1500 | if (i == NR_AUTOLOAD_MSRS) { |
1501 | printk_once(KERN_WARNING"Not enough mst switch entries. " | 1501 | printk_once(KERN_WARNING "Not enough msr switch entries. " |
1502 | "Can't add msr %x\n", msr); | 1502 | "Can't add msr %x\n", msr); |
1503 | return; | 1503 | return; |
1504 | } else if (i == m->nr) { | 1504 | } else if (i == m->nr) { |
@@ -1898,16 +1898,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) | |||
1898 | /* | 1898 | /* |
1899 | * KVM wants to inject page-faults which it got to the guest. This function | 1899 | * KVM wants to inject page-faults which it got to the guest. This function |
1900 | * checks whether in a nested guest, we need to inject them to L1 or L2. | 1900 | * checks whether in a nested guest, we need to inject them to L1 or L2. |
1901 | * This function assumes it is called with the exit reason in vmcs02 being | ||
1902 | * a #PF exception (this is the only case in which KVM injects a #PF when L2 | ||
1903 | * is running). | ||
1904 | */ | 1901 | */ |
1905 | static int nested_pf_handled(struct kvm_vcpu *vcpu) | 1902 | static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) |
1906 | { | 1903 | { |
1907 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | 1904 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); |
1908 | 1905 | ||
1909 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | 1906 | if (!(vmcs12->exception_bitmap & (1u << nr))) |
1910 | if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) | ||
1911 | return 0; | 1907 | return 0; |
1912 | 1908 | ||
1913 | nested_vmx_vmexit(vcpu); | 1909 | nested_vmx_vmexit(vcpu); |
@@ -1921,8 +1917,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, | |||
1921 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 1917 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
1922 | u32 intr_info = nr | INTR_INFO_VALID_MASK; | 1918 | u32 intr_info = nr | INTR_INFO_VALID_MASK; |
1923 | 1919 | ||
1924 | if (nr == PF_VECTOR && is_guest_mode(vcpu) && | 1920 | if (!reinject && is_guest_mode(vcpu) && |
1925 | !vmx->nested.nested_run_pending && nested_pf_handled(vcpu)) | 1921 | nested_vmx_check_exception(vcpu, nr)) |
1926 | return; | 1922 | return; |
1927 | 1923 | ||
1928 | if (has_error_code) { | 1924 | if (has_error_code) { |
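nested_pf_handled() only knew about #PF; its rewrite as nested_vmx_check_exception() reflects any freshly raised exception to L1 whenever the corresponding bit is set in vmcs12's exception bitmap, and the new !reinject guard keeps exceptions that are being re-delivered after a vmexit from bouncing to L1 a second time. A small stand-alone model of the bitmap test (the vector numbers are architectural; the helper name is made up):

    #include <stdint.h>
    #include <stdio.h>

    #define PF_VECTOR 14  /* page fault           */
    #define NM_VECTOR  7  /* device not available */

    /* L1 asks to intercept a vector by setting its bit in the bitmap. */
    static int l1_intercepts(uint32_t exception_bitmap, unsigned nr)
    {
        return (exception_bitmap & (1u << nr)) != 0;
    }

    int main(void)
    {
        uint32_t bitmap = 1u << PF_VECTOR;   /* L1 wants only #PF */
        printf("#PF -> L1? %d\n", l1_intercepts(bitmap, PF_VECTOR));
        printf("#NM -> L1? %d\n", l1_intercepts(bitmap, NM_VECTOR));
        return 0;
    }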
@@ -2204,9 +2200,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2204 | #ifdef CONFIG_X86_64 | 2200 | #ifdef CONFIG_X86_64 |
2205 | VM_EXIT_HOST_ADDR_SPACE_SIZE | | 2201 | VM_EXIT_HOST_ADDR_SPACE_SIZE | |
2206 | #endif | 2202 | #endif |
2207 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; | 2203 | VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | |
2204 | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; | ||
2205 | if (!(nested_vmx_pinbased_ctls_high & PIN_BASED_VMX_PREEMPTION_TIMER) || | ||
2206 | !(nested_vmx_exit_ctls_high & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) { | ||
2207 | nested_vmx_exit_ctls_high &= ~VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; | ||
2208 | nested_vmx_pinbased_ctls_high &= ~PIN_BASED_VMX_PREEMPTION_TIMER; | ||
2209 | } | ||
2208 | nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | | 2210 | nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | |
2209 | VM_EXIT_LOAD_IA32_EFER); | 2211 | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER); |
2210 | 2212 | ||
2211 | /* entry controls */ | 2213 | /* entry controls */ |
2212 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, | 2214 | rdmsr(MSR_IA32_VMX_ENTRY_CTLS, |
@@ -2226,7 +2228,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2226 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); | 2228 | nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); |
2227 | nested_vmx_procbased_ctls_low = 0; | 2229 | nested_vmx_procbased_ctls_low = 0; |
2228 | nested_vmx_procbased_ctls_high &= | 2230 | nested_vmx_procbased_ctls_high &= |
2229 | CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING | | 2231 | CPU_BASED_VIRTUAL_INTR_PENDING | |
2232 | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | | ||
2230 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | | 2233 | CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | |
2231 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | | 2234 | CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | |
2232 | CPU_BASED_CR3_STORE_EXITING | | 2235 | CPU_BASED_CR3_STORE_EXITING | |
@@ -2252,13 +2255,15 @@ static __init void nested_vmx_setup_ctls_msrs(void) | |||
2252 | nested_vmx_secondary_ctls_low = 0; | 2255 | nested_vmx_secondary_ctls_low = 0; |
2253 | nested_vmx_secondary_ctls_high &= | 2256 | nested_vmx_secondary_ctls_high &= |
2254 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | | 2257 | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | |
2258 | SECONDARY_EXEC_UNRESTRICTED_GUEST | | ||
2255 | SECONDARY_EXEC_WBINVD_EXITING; | 2259 | SECONDARY_EXEC_WBINVD_EXITING; |
2256 | 2260 | ||
2257 | if (enable_ept) { | 2261 | if (enable_ept) { |
2258 | /* nested EPT: emulate EPT also to L1 */ | 2262 | /* nested EPT: emulate EPT also to L1 */ |
2259 | nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; | 2263 | nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; |
2260 | nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | | 2264 | nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | |
2261 | VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; | 2265 | VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | |
2266 | VMX_EPT_INVEPT_BIT; | ||
2262 | nested_vmx_ept_caps &= vmx_capability.ept; | 2267 | nested_vmx_ept_caps &= vmx_capability.ept; |
2263 | /* | 2268 | /* |
2264 | * Since invept is completely emulated we support both global | 2269 | * Since invept is completely emulated we support both global |
@@ -3380,8 +3385,10 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
3380 | if (enable_ept) { | 3385 | if (enable_ept) { |
3381 | eptp = construct_eptp(cr3); | 3386 | eptp = construct_eptp(cr3); |
3382 | vmcs_write64(EPT_POINTER, eptp); | 3387 | vmcs_write64(EPT_POINTER, eptp); |
3383 | guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) : | 3388 | if (is_paging(vcpu) || is_guest_mode(vcpu)) |
3384 | vcpu->kvm->arch.ept_identity_map_addr; | 3389 | guest_cr3 = kvm_read_cr3(vcpu); |
3390 | else | ||
3391 | guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; | ||
3385 | ept_load_pdptrs(vcpu); | 3392 | ept_load_pdptrs(vcpu); |
3386 | } | 3393 | } |
3387 | 3394 | ||
@@ -4879,6 +4886,17 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) | |||
4879 | hypercall[2] = 0xc1; | 4886 | hypercall[2] = 0xc1; |
4880 | } | 4887 | } |
4881 | 4888 | ||
4889 | static bool nested_cr0_valid(struct vmcs12 *vmcs12, unsigned long val) | ||
4890 | { | ||
4891 | unsigned long always_on = VMXON_CR0_ALWAYSON; | ||
4892 | |||
4893 | if (nested_vmx_secondary_ctls_high & | ||
4894 | SECONDARY_EXEC_UNRESTRICTED_GUEST && | ||
4895 | nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) | ||
4896 | always_on &= ~(X86_CR0_PE | X86_CR0_PG); | ||
4897 | return (val & always_on) == always_on; | ||
4898 | } | ||
4899 | |||
4882 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ | 4900 | /* called to set cr0 as appropriate for a mov-to-cr0 exit. */ |
4883 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | 4901 | static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) |
4884 | { | 4902 | { |
@@ -4897,9 +4915,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) | |||
4897 | val = (val & ~vmcs12->cr0_guest_host_mask) | | 4915 | val = (val & ~vmcs12->cr0_guest_host_mask) | |
4898 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); | 4916 | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); |
4899 | 4917 | ||
4900 | /* TODO: will have to take unrestricted guest mode into | 4918 | if (!nested_cr0_valid(vmcs12, val)) |
4901 | * account */ | ||
4902 | if ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) | ||
4903 | return 1; | 4919 | return 1; |
4904 | 4920 | ||
4905 | if (kvm_set_cr0(vcpu, val)) | 4921 | if (kvm_set_cr0(vcpu, val)) |
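nested_cr0_valid() resolves the old TODO: when SECONDARY_EXEC_UNRESTRICTED_GUEST is both exposed by L0 and enabled in vmcs12, PE and PG are dropped from the always-on mask, so L1 may legally run L2 in real or protected-unpaged mode. A stand-alone model of the check (the always-on constant here is illustrative; the real mask derives from the IA32_VMX_CR0_FIXED* MSRs):

    #include <stdio.h>

    #define X86_CR0_PE (1ul << 0)
    #define X86_CR0_NE (1ul << 5)
    #define X86_CR0_PG (1ul << 31)

    /* Illustrative always-on mask, not the real VMXON_CR0_ALWAYSON value. */
    #define CR0_ALWAYSON (X86_CR0_PE | X86_CR0_NE | X86_CR0_PG)

    static int cr0_valid(unsigned long val, int unrestricted_guest)
    {
        unsigned long always_on = CR0_ALWAYSON;

        if (unrestricted_guest)           /* PE/PG may be clear under URG */
            always_on &= ~(X86_CR0_PE | X86_CR0_PG);
        return (val & always_on) == always_on;
    }

    int main(void)
    {
        printf("%d\n", cr0_valid(X86_CR0_NE, 1));  /* unpaged + URG: accepted */
        printf("%d\n", cr0_valid(X86_CR0_NE, 0));  /* unpaged, no URG: rejected */
        return 0;
    }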
@@ -6627,6 +6643,9 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) | |||
6627 | return 0; | 6643 | return 0; |
6628 | else if (is_page_fault(intr_info)) | 6644 | else if (is_page_fault(intr_info)) |
6629 | return enable_ept; | 6645 | return enable_ept; |
6646 | else if (is_no_device(intr_info) && | ||
6647 | !(nested_read_cr0(vmcs12) & X86_CR0_TS)) | ||
6648 | return 0; | ||
6630 | return vmcs12->exception_bitmap & | 6649 | return vmcs12->exception_bitmap & |
6631 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); | 6650 | (1u << (intr_info & INTR_INFO_VECTOR_MASK)); |
6632 | case EXIT_REASON_EXTERNAL_INTERRUPT: | 6651 | case EXIT_REASON_EXTERNAL_INTERRUPT: |
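The new is_no_device() branch keeps a #NM in L0 whenever L2's view of CR0.TS is clear: in that case the fault was induced by L0's own lazy-FPU trapping, not by anything L1 configured, so reflecting it to L1 would be wrong even if L1's exception bitmap has the #NM bit set.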
@@ -6722,6 +6741,27 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) | |||
6722 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); | 6741 | *info2 = vmcs_read32(VM_EXIT_INTR_INFO); |
6723 | } | 6742 | } |
6724 | 6743 | ||
6744 | static void nested_adjust_preemption_timer(struct kvm_vcpu *vcpu) | ||
6745 | { | ||
6746 | u64 delta_tsc_l1; | ||
6747 | u32 preempt_val_l1, preempt_val_l2, preempt_scale; | ||
6748 | |||
6749 | if (!(get_vmcs12(vcpu)->pin_based_vm_exec_control & | ||
6750 | PIN_BASED_VMX_PREEMPTION_TIMER)) | ||
6751 | return; | ||
6752 | preempt_scale = native_read_msr(MSR_IA32_VMX_MISC) & | ||
6753 | MSR_IA32_VMX_MISC_PREEMPTION_TIMER_SCALE; | ||
6754 | preempt_val_l2 = vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); | ||
6755 | delta_tsc_l1 = vmx_read_l1_tsc(vcpu, native_read_tsc()) | ||
6756 | - vcpu->arch.last_guest_tsc; | ||
6757 | preempt_val_l1 = delta_tsc_l1 >> preempt_scale; | ||
6758 | if (preempt_val_l2 <= preempt_val_l1) | ||
6759 | preempt_val_l2 = 0; | ||
6760 | else | ||
6761 | preempt_val_l2 -= preempt_val_l1; | ||
6762 | vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, preempt_val_l2); | ||
6763 | } | ||
6764 | |||
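nested_adjust_preemption_timer() charges L2 for the time L0 and L1 consumed between the last L2 exit and the next L2 entry: the elapsed TSC delta is converted into preemption-timer units using the power-of-two scale advertised in IA32_VMX_MISC[4:0] and subtracted from the saved timer value, clamping at zero. The arithmetic, as a runnable model:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t adjust_timer(uint32_t remaining_l2, uint64_t delta_tsc_l1,
                                 unsigned preempt_scale)
    {
        /* TSC ticks -> preemption-timer units via a right shift. */
        uint32_t spent = (uint32_t)(delta_tsc_l1 >> preempt_scale);

        return remaining_l2 <= spent ? 0 : remaining_l2 - spent;
    }

    int main(void)
    {
        /* 1,000,000 TSC ticks at scale 5 cost 31,250 timer units. */
        printf("%u\n", adjust_timer(100000, 1000000, 5));  /* prints 68750 */
        return 0;
    }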
6725 | /* | 6765 | /* |
6726 | * The guest has exited. See if we can fix it or if we need userspace | 6766 | * The guest has exited. See if we can fix it or if we need userspace |
6727 | * assistance. | 6767 | * assistance. |
@@ -6736,20 +6776,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) | |||
6736 | if (vmx->emulation_required) | 6776 | if (vmx->emulation_required) |
6737 | return handle_invalid_guest_state(vcpu); | 6777 | return handle_invalid_guest_state(vcpu); |
6738 | 6778 | ||
6739 | /* | ||
6740 | * the KVM_REQ_EVENT optimization bit is only on for one entry, and if | ||
6741 | * we did not inject a still-pending event to L1 now because of | ||
6742 | * nested_run_pending, we need to re-enable this bit. | ||
6743 | */ | ||
6744 | if (vmx->nested.nested_run_pending) | ||
6745 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
6746 | |||
6747 | if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH || | ||
6748 | exit_reason == EXIT_REASON_VMRESUME)) | ||
6749 | vmx->nested.nested_run_pending = 1; | ||
6750 | else | ||
6751 | vmx->nested.nested_run_pending = 0; | ||
6752 | |||
6753 | if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { | 6779 | if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) { |
6754 | nested_vmx_vmexit(vcpu); | 6780 | nested_vmx_vmexit(vcpu); |
6755 | return 1; | 6781 | return 1; |
@@ -7061,9 +7087,9 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, | |||
7061 | case INTR_TYPE_HARD_EXCEPTION: | 7087 | case INTR_TYPE_HARD_EXCEPTION: |
7062 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { | 7088 | if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { |
7063 | u32 err = vmcs_read32(error_code_field); | 7089 | u32 err = vmcs_read32(error_code_field); |
7064 | kvm_queue_exception_e(vcpu, vector, err); | 7090 | kvm_requeue_exception_e(vcpu, vector, err); |
7065 | } else | 7091 | } else |
7066 | kvm_queue_exception(vcpu, vector); | 7092 | kvm_requeue_exception(vcpu, vector); |
7067 | break; | 7093 | break; |
7068 | case INTR_TYPE_SOFT_INTR: | 7094 | case INTR_TYPE_SOFT_INTR: |
7069 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); | 7095 | vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); |
@@ -7146,6 +7172,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
7146 | atomic_switch_perf_msrs(vmx); | 7172 | atomic_switch_perf_msrs(vmx); |
7147 | debugctlmsr = get_debugctlmsr(); | 7173 | debugctlmsr = get_debugctlmsr(); |
7148 | 7174 | ||
7175 | if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) | ||
7176 | nested_adjust_preemption_timer(vcpu); | ||
7149 | vmx->__launched = vmx->loaded_vmcs->launched; | 7177 | vmx->__launched = vmx->loaded_vmcs->launched; |
7150 | asm( | 7178 | asm( |
7151 | /* Store host registers */ | 7179 | /* Store host registers */ |
@@ -7284,6 +7312,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | |||
7284 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); | 7312 | vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); |
7285 | trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); | 7313 | trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); |
7286 | 7314 | ||
7315 | /* | ||
7316 | * the KVM_REQ_EVENT optimization bit is only on for one entry, and if | ||
7317 | * we did not inject a still-pending event to L1 now because of | ||
7318 | * nested_run_pending, we need to re-enable this bit. | ||
7319 | */ | ||
7320 | if (vmx->nested.nested_run_pending) | ||
7321 | kvm_make_request(KVM_REQ_EVENT, vcpu); | ||
7322 | |||
7323 | vmx->nested.nested_run_pending = 0; | ||
7324 | |||
7287 | vmx_complete_atomic_exit(vmx); | 7325 | vmx_complete_atomic_exit(vmx); |
7288 | vmx_recover_nmi_blocking(vmx); | 7326 | vmx_recover_nmi_blocking(vmx); |
7289 | vmx_complete_interrupts(vmx); | 7327 | vmx_complete_interrupts(vmx); |
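The nested_run_pending bookkeeping moves out of vmx_handle_exit() entirely: the flag is now raised in nested_vmx_run() when VMLAUNCH/VMRESUME is emulated (see the hunk further down) and cleared here, right after the hardware entry completes, with the deferred KVM_REQ_EVENT re-armed at the same point. Tracking the actual entry rather than inferring it from the exit reason keeps the flag accurate across aborted or re-executed entries.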
@@ -7410,8 +7448,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) | |||
7410 | */ | 7448 | */ |
7411 | if (is_mmio) | 7449 | if (is_mmio) |
7412 | ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; | 7450 | ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; |
7413 | else if (vcpu->kvm->arch.iommu_domain && | 7451 | else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) |
7414 | !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)) | ||
7415 | ret = kvm_get_guest_memory_type(vcpu, gfn) << | 7452 | ret = kvm_get_guest_memory_type(vcpu, gfn) << |
7416 | VMX_EPT_MT_EPTE_SHIFT; | 7453 | VMX_EPT_MT_EPTE_SHIFT; |
7417 | else | 7454 | else |
@@ -7501,9 +7538,9 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) | |||
7501 | return get_vmcs12(vcpu)->ept_pointer; | 7538 | return get_vmcs12(vcpu)->ept_pointer; |
7502 | } | 7539 | } |
7503 | 7540 | ||
7504 | static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | 7541 | static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) |
7505 | { | 7542 | { |
7506 | int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, | 7543 | kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu, |
7507 | nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); | 7544 | nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT); |
7508 | 7545 | ||
7509 | vcpu->arch.mmu.set_cr3 = vmx_set_cr3; | 7546 | vcpu->arch.mmu.set_cr3 = vmx_set_cr3; |
@@ -7511,8 +7548,6 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) | |||
7511 | vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; | 7548 | vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; |
7512 | 7549 | ||
7513 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; | 7550 | vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; |
7514 | |||
7515 | return r; | ||
7516 | } | 7551 | } |
7517 | 7552 | ||
7518 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | 7553 | static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) |
@@ -7520,6 +7555,20 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) | |||
7520 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; | 7555 | vcpu->arch.walk_mmu = &vcpu->arch.mmu; |
7521 | } | 7556 | } |
7522 | 7557 | ||
7558 | static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, | ||
7559 | struct x86_exception *fault) | ||
7560 | { | ||
7561 | struct vmcs12 *vmcs12 = get_vmcs12(vcpu); | ||
7562 | |||
7563 | WARN_ON(!is_guest_mode(vcpu)); | ||
7564 | |||
7565 | /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ | ||
7566 | if (vmcs12->exception_bitmap & (1u << PF_VECTOR)) | ||
7567 | nested_vmx_vmexit(vcpu); | ||
7568 | else | ||
7569 | kvm_inject_page_fault(vcpu, fault); | ||
7570 | } | ||
7571 | |||
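vmx_inject_page_fault_nested() complements the nested_vmx_check_exception() change for the shadow-paging case: when EPT is off, prepare_vmcs02() points walk_mmu->inject_page_fault at this hook (and load_vmcs12_host_state() restores kvm_inject_page_fault on the way back, both below), so #PFs generated while walking L2's page tables get routed to L1 or injected into L2 according to vmcs12's exception bitmap. The PFEC match/mask TODO moves here along with the check.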
7523 | /* | 7572 | /* |
7524 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested | 7573 | * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested |
7525 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it | 7574 | * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it |
@@ -7533,6 +7582,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7533 | { | 7582 | { |
7534 | struct vcpu_vmx *vmx = to_vmx(vcpu); | 7583 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
7535 | u32 exec_control; | 7584 | u32 exec_control; |
7585 | u32 exit_control; | ||
7536 | 7586 | ||
7537 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); | 7587 | vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); |
7538 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); | 7588 | vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); |
@@ -7706,7 +7756,10 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7706 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER | 7756 | * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER |
7707 | * bits are further modified by vmx_set_efer() below. | 7757 | * bits are further modified by vmx_set_efer() below. |
7708 | */ | 7758 | */ |
7709 | vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); | 7759 | exit_control = vmcs_config.vmexit_ctrl; |
7760 | if (vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) | ||
7761 | exit_control |= VM_EXIT_SAVE_VMX_PREEMPTION_TIMER; | ||
7762 | vmcs_write32(VM_EXIT_CONTROLS, exit_control); | ||
7710 | 7763 | ||
7711 | /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are | 7764 | /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are |
7712 | * emulated by vmx_set_efer(), below. | 7765 | * emulated by vmx_set_efer(), below. |
@@ -7773,6 +7826,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
7773 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); | 7826 | kvm_set_cr3(vcpu, vmcs12->guest_cr3); |
7774 | kvm_mmu_reset_context(vcpu); | 7827 | kvm_mmu_reset_context(vcpu); |
7775 | 7828 | ||
7829 | if (!enable_ept) | ||
7830 | vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; | ||
7831 | |||
7776 | /* | 7832 | /* |
7777 | * L1 may access the L2's PDPTR, so save them to construct vmcs12 | 7833 | * L1 may access the L2's PDPTR, so save them to construct vmcs12 |
7778 | */ | 7834 | */ |
@@ -7876,7 +7932,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
7876 | return 1; | 7932 | return 1; |
7877 | } | 7933 | } |
7878 | 7934 | ||
7879 | if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || | 7935 | if (!nested_cr0_valid(vmcs12, vmcs12->guest_cr0) || |
7880 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { | 7936 | ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { |
7881 | nested_vmx_entry_failure(vcpu, vmcs12, | 7937 | nested_vmx_entry_failure(vcpu, vmcs12, |
7882 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); | 7938 | EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); |
@@ -7938,6 +7994,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) | |||
7938 | 7994 | ||
7939 | enter_guest_mode(vcpu); | 7995 | enter_guest_mode(vcpu); |
7940 | 7996 | ||
7997 | vmx->nested.nested_run_pending = 1; | ||
7998 | |||
7941 | vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); | 7999 | vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); |
7942 | 8000 | ||
7943 | cpu = get_cpu(); | 8001 | cpu = get_cpu(); |
@@ -8005,7 +8063,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | |||
8005 | u32 idt_vectoring; | 8063 | u32 idt_vectoring; |
8006 | unsigned int nr; | 8064 | unsigned int nr; |
8007 | 8065 | ||
8008 | if (vcpu->arch.exception.pending) { | 8066 | if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { |
8009 | nr = vcpu->arch.exception.nr; | 8067 | nr = vcpu->arch.exception.nr; |
8010 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; | 8068 | idt_vectoring = nr | VECTORING_INFO_VALID_MASK; |
8011 | 8069 | ||
@@ -8023,7 +8081,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, | |||
8023 | } | 8081 | } |
8024 | 8082 | ||
8025 | vmcs12->idt_vectoring_info_field = idt_vectoring; | 8083 | vmcs12->idt_vectoring_info_field = idt_vectoring; |
8026 | } else if (vcpu->arch.nmi_pending) { | 8084 | } else if (vcpu->arch.nmi_injected) { |
8027 | vmcs12->idt_vectoring_info_field = | 8085 | vmcs12->idt_vectoring_info_field = |
8028 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; | 8086 | INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; |
8029 | } else if (vcpu->arch.interrupt.pending) { | 8087 | } else if (vcpu->arch.interrupt.pending) { |
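vmcs12_save_pending_event() now records an exception in idt_vectoring_info_field only when it is a reinjected one, i.e. an event that had already been delivered to L2 and was interrupted mid-delivery; a freshly pending exception is instead delivered through the normal injection path after the switch back to L1. The NMI test likewise changes from nmi_pending (an NMI queued but never sent) to nmi_injected (one actually in flight).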
@@ -8105,6 +8163,11 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
8105 | vmcs12->guest_pending_dbg_exceptions = | 8163 | vmcs12->guest_pending_dbg_exceptions = |
8106 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); | 8164 | vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); |
8107 | 8165 | ||
8166 | if ((vmcs12->pin_based_vm_exec_control & PIN_BASED_VMX_PREEMPTION_TIMER) && | ||
8167 | (vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)) | ||
8168 | vmcs12->vmx_preemption_timer_value = | ||
8169 | vmcs_read32(VMX_PREEMPTION_TIMER_VALUE); | ||
8170 | |||
8108 | /* | 8171 | /* |
8109 | * In some cases (usually, nested EPT), L2 is allowed to change its | 8172 | * In some cases (usually, nested EPT), L2 is allowed to change its |
8110 | * own CR3 without exiting. If it has changed it, we must keep it. | 8173 | * own CR3 without exiting. If it has changed it, we must keep it. |
@@ -8130,6 +8193,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) | |||
8130 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); | 8193 | vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); |
8131 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) | 8194 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) |
8132 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); | 8195 | vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); |
8196 | if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) | ||
8197 | vmcs12->guest_ia32_efer = vcpu->arch.efer; | ||
8133 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); | 8198 | vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); |
8134 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); | 8199 | vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); |
8135 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); | 8200 | vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); |
@@ -8201,7 +8266,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |||
8201 | * fpu_active (which may have changed). | 8266 | * fpu_active (which may have changed). |
8202 | * Note that vmx_set_cr0 refers to efer set above. | 8267 | * Note that vmx_set_cr0 refers to efer set above. |
8203 | */ | 8268 | */ |
8204 | kvm_set_cr0(vcpu, vmcs12->host_cr0); | 8269 | vmx_set_cr0(vcpu, vmcs12->host_cr0); |
8205 | /* | 8270 | /* |
8206 | * If we did fpu_activate()/fpu_deactivate() during L2's run, we need | 8271 | * If we did fpu_activate()/fpu_deactivate() during L2's run, we need |
8207 | * to apply the same changes to L1's vmcs. We just set cr0 correctly, | 8272 | * to apply the same changes to L1's vmcs. We just set cr0 correctly, |
@@ -8224,6 +8289,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, | |||
8224 | kvm_set_cr3(vcpu, vmcs12->host_cr3); | 8289 | kvm_set_cr3(vcpu, vmcs12->host_cr3); |
8225 | kvm_mmu_reset_context(vcpu); | 8290 | kvm_mmu_reset_context(vcpu); |
8226 | 8291 | ||
8292 | if (!enable_ept) | ||
8293 | vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; | ||
8294 | |||
8227 | if (enable_vpid) { | 8295 | if (enable_vpid) { |
8228 | /* | 8296 | /* |
8229 | * Trivially support vpid by letting L2s share their parent | 8297 | * Trivially support vpid by letting L2s share their parent |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5ca72a5cdb6..21ef1ba184ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -577,6 +577,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) | |||
577 | int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) | 577 | int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) |
578 | { | 578 | { |
579 | u64 xcr0; | 579 | u64 xcr0; |
580 | u64 valid_bits; | ||
580 | 581 | ||
581 | /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ | 582 | /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ |
582 | if (index != XCR_XFEATURE_ENABLED_MASK) | 583 | if (index != XCR_XFEATURE_ENABLED_MASK) |
@@ -586,8 +587,16 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) | |||
586 | return 1; | 587 | return 1; |
587 | if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) | 588 | if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) |
588 | return 1; | 589 | return 1; |
589 | if (xcr0 & ~host_xcr0) | 590 | |
591 | /* | ||
592 | * Do not allow the guest to set bits that we do not support | ||
593 | * saving. However, xcr0 bit 0 is always set, even if the | ||
594 | * emulated CPU does not support XSAVE (see fx_init). | ||
595 | */ | ||
596 | valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP; | ||
597 | if (xcr0 & ~valid_bits) | ||
590 | return 1; | 598 | return 1; |
599 | |||
591 | kvm_put_guest_xcr0(vcpu); | 600 | kvm_put_guest_xcr0(vcpu); |
592 | vcpu->arch.xcr0 = xcr0; | 601 | vcpu->arch.xcr0 = xcr0; |
593 | return 0; | 602 | return 0; |
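__kvm_set_xcr() previously checked the new XCR0 only against host_xcr0, so a guest could enable xsave states its own CPUID never advertised. The rewrite validates against the per-vcpu guest_supported_xcr0, with XSTATE_FP always permitted since bit 0 of XCR0 is architecturally fixed to 1. A compact model of the combined checks:

    #include <stdint.h>
    #include <stdio.h>

    #define XSTATE_FP  (1ull << 0)
    #define XSTATE_SSE (1ull << 1)
    #define XSTATE_YMM (1ull << 2)

    static int set_xcr0(uint64_t xcr0, uint64_t guest_supported_xcr0)
    {
        uint64_t valid_bits = guest_supported_xcr0 | XSTATE_FP;

        if (!(xcr0 & XSTATE_FP))
            return 1;                      /* x87 bit must stay set   */
        if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
            return 1;                      /* AVX state requires SSE  */
        if (xcr0 & ~valid_bits)
            return 1;                      /* not advertised to guest */
        return 0;
    }

    int main(void)
    {
        uint64_t guest = XSTATE_FP | XSTATE_SSE;            /* no AVX exposed */
        printf("%d\n", set_xcr0(XSTATE_FP | XSTATE_SSE, guest));              /* 0 */
        printf("%d\n", set_xcr0(XSTATE_FP | XSTATE_SSE | XSTATE_YMM, guest)); /* 1 */
        return 0;
    }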
@@ -684,7 +693,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) | |||
684 | 693 | ||
685 | vcpu->arch.cr3 = cr3; | 694 | vcpu->arch.cr3 = cr3; |
686 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); | 695 | __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); |
687 | vcpu->arch.mmu.new_cr3(vcpu); | 696 | kvm_mmu_new_cr3(vcpu); |
688 | return 0; | 697 | return 0; |
689 | } | 698 | } |
690 | EXPORT_SYMBOL_GPL(kvm_set_cr3); | 699 | EXPORT_SYMBOL_GPL(kvm_set_cr3); |
@@ -2564,6 +2573,7 @@ int kvm_dev_ioctl_check_extension(long ext) | |||
2564 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: | 2573 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: |
2565 | case KVM_CAP_SET_TSS_ADDR: | 2574 | case KVM_CAP_SET_TSS_ADDR: |
2566 | case KVM_CAP_EXT_CPUID: | 2575 | case KVM_CAP_EXT_CPUID: |
2576 | case KVM_CAP_EXT_EMUL_CPUID: | ||
2567 | case KVM_CAP_CLOCKSOURCE: | 2577 | case KVM_CAP_CLOCKSOURCE: |
2568 | case KVM_CAP_PIT: | 2578 | case KVM_CAP_PIT: |
2569 | case KVM_CAP_NOP_IO_DELAY: | 2579 | case KVM_CAP_NOP_IO_DELAY: |
@@ -2673,15 +2683,17 @@ long kvm_arch_dev_ioctl(struct file *filp, | |||
2673 | r = 0; | 2683 | r = 0; |
2674 | break; | 2684 | break; |
2675 | } | 2685 | } |
2676 | case KVM_GET_SUPPORTED_CPUID: { | 2686 | case KVM_GET_SUPPORTED_CPUID: |
2687 | case KVM_GET_EMULATED_CPUID: { | ||
2677 | struct kvm_cpuid2 __user *cpuid_arg = argp; | 2688 | struct kvm_cpuid2 __user *cpuid_arg = argp; |
2678 | struct kvm_cpuid2 cpuid; | 2689 | struct kvm_cpuid2 cpuid; |
2679 | 2690 | ||
2680 | r = -EFAULT; | 2691 | r = -EFAULT; |
2681 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) | 2692 | if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) |
2682 | goto out; | 2693 | goto out; |
2683 | r = kvm_dev_ioctl_get_supported_cpuid(&cpuid, | 2694 | |
2684 | cpuid_arg->entries); | 2695 | r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, |
2696 | ioctl); | ||
2685 | if (r) | 2697 | if (r) |
2686 | goto out; | 2698 | goto out; |
2687 | 2699 | ||
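KVM_GET_EMULATED_CPUID shares the KVM_GET_SUPPORTED_CPUID handler and differs only in which feature set kvm_dev_ioctl_get_cpuid() reports, so userspace queries it the same way. A minimal usage sketch (assumes a 3.13+ linux/kvm.h; error handling trimmed):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        int kvm = open("/dev/kvm", O_RDONLY);
        int nent = 64;
        struct kvm_cpuid2 *cpuid =
            calloc(1, sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));

        if (kvm < 0 || !cpuid)
            return 1;
        cpuid->nent = nent;
        /* Leaves KVM can emulate in software even without host CPU support. */
        if (ioctl(kvm, KVM_GET_EMULATED_CPUID, cpuid) == 0)
            printf("%u emulated cpuid entries\n", cpuid->nent);
        free(cpuid);
        return 0;
    }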
@@ -2715,8 +2727,7 @@ static void wbinvd_ipi(void *garbage) | |||
2715 | 2727 | ||
2716 | static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) | 2728 | static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) |
2717 | { | 2729 | { |
2718 | return vcpu->kvm->arch.iommu_domain && | 2730 | return kvm_arch_has_noncoherent_dma(vcpu->kvm); |
2719 | !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); | ||
2720 | } | 2731 | } |
2721 | 2732 | ||
2722 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) | 2733 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
@@ -2984,11 +2995,13 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, | |||
2984 | static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, | 2995 | static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, |
2985 | struct kvm_xsave *guest_xsave) | 2996 | struct kvm_xsave *guest_xsave) |
2986 | { | 2997 | { |
2987 | if (cpu_has_xsave) | 2998 | if (cpu_has_xsave) { |
2988 | memcpy(guest_xsave->region, | 2999 | memcpy(guest_xsave->region, |
2989 | &vcpu->arch.guest_fpu.state->xsave, | 3000 | &vcpu->arch.guest_fpu.state->xsave, |
2990 | xstate_size); | 3001 | vcpu->arch.guest_xstate_size); |
2991 | else { | 3002 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &= |
3003 | vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE; | ||
3004 | } else { | ||
2992 | memcpy(guest_xsave->region, | 3005 | memcpy(guest_xsave->region, |
2993 | &vcpu->arch.guest_fpu.state->fxsave, | 3006 | &vcpu->arch.guest_fpu.state->fxsave, |
2994 | sizeof(struct i387_fxsave_struct)); | 3007 | sizeof(struct i387_fxsave_struct)); |
@@ -3003,10 +3016,19 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, | |||
3003 | u64 xstate_bv = | 3016 | u64 xstate_bv = |
3004 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; | 3017 | *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; |
3005 | 3018 | ||
3006 | if (cpu_has_xsave) | 3019 | if (cpu_has_xsave) { |
3020 | /* | ||
3021 | * Here we allow setting states that are not present in | ||
3022 | * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility | ||
3023 | * with old userspace. | ||
3024 | */ | ||
3025 | if (xstate_bv & ~KVM_SUPPORTED_XCR0) | ||
3026 | return -EINVAL; | ||
3027 | if (xstate_bv & ~host_xcr0) | ||
3028 | return -EINVAL; | ||
3007 | memcpy(&vcpu->arch.guest_fpu.state->xsave, | 3029 | memcpy(&vcpu->arch.guest_fpu.state->xsave, |
3008 | guest_xsave->region, xstate_size); | 3030 | guest_xsave->region, vcpu->arch.guest_xstate_size); |
3009 | else { | 3031 | } else { |
3010 | if (xstate_bv & ~XSTATE_FPSSE) | 3032 | if (xstate_bv & ~XSTATE_FPSSE) |
3011 | return -EINVAL; | 3033 | return -EINVAL; |
3012 | memcpy(&vcpu->arch.guest_fpu.state->fxsave, | 3034 | memcpy(&vcpu->arch.guest_fpu.state->fxsave, |
@@ -3042,9 +3064,9 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, | |||
3042 | 3064 | ||
3043 | for (i = 0; i < guest_xcrs->nr_xcrs; i++) | 3065 | for (i = 0; i < guest_xcrs->nr_xcrs; i++) |
3044 | /* Only support XCR0 currently */ | 3066 | /* Only support XCR0 currently */ |
3045 | if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { | 3067 | if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { |
3046 | r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, | 3068 | r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, |
3047 | guest_xcrs->xcrs[0].value); | 3069 | guest_xcrs->xcrs[i].value); |
3048 | break; | 3070 | break; |
3049 | } | 3071 | } |
3050 | if (r) | 3072 | if (r) |
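The kvm_vcpu_ioctl_x86_set_xcrs() hunk fixes a copy-paste bug: the loop iterated over i but always tested and consumed xcrs[0], so a request carrying XCR0 at any nonzero index was silently mishandled. The xsave hunks above are the companion hardening, copying only guest_xstate_size bytes and masking the xstate header against what the guest may actually use instead of trusting the raw host xstate_size.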
@@ -4775,8 +4797,8 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) | |||
4775 | 4797 | ||
4776 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt) | 4798 | static void init_decode_cache(struct x86_emulate_ctxt *ctxt) |
4777 | { | 4799 | { |
4778 | memset(&ctxt->twobyte, 0, | 4800 | memset(&ctxt->opcode_len, 0, |
4779 | (void *)&ctxt->_regs - (void *)&ctxt->twobyte); | 4801 | (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); |
4780 | 4802 | ||
4781 | ctxt->fetch.start = 0; | 4803 | ctxt->fetch.start = 0; |
4782 | ctxt->fetch.end = 0; | 4804 | ctxt->fetch.end = 0; |
@@ -5094,8 +5116,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, | |||
5094 | ctxt->have_exception = false; | 5116 | ctxt->have_exception = false; |
5095 | ctxt->perm_ok = false; | 5117 | ctxt->perm_ok = false; |
5096 | 5118 | ||
5097 | ctxt->only_vendor_specific_insn | 5119 | ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; |
5098 | = emulation_type & EMULTYPE_TRAP_UD; | ||
5099 | 5120 | ||
5100 | r = x86_decode_insn(ctxt, insn, insn_len); | 5121 | r = x86_decode_insn(ctxt, insn, insn_len); |
5101 | 5122 | ||
@@ -5263,7 +5284,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
5263 | 5284 | ||
5264 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); | 5285 | smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); |
5265 | 5286 | ||
5266 | raw_spin_lock(&kvm_lock); | 5287 | spin_lock(&kvm_lock); |
5267 | list_for_each_entry(kvm, &vm_list, vm_list) { | 5288 | list_for_each_entry(kvm, &vm_list, vm_list) { |
5268 | kvm_for_each_vcpu(i, vcpu, kvm) { | 5289 | kvm_for_each_vcpu(i, vcpu, kvm) { |
5269 | if (vcpu->cpu != freq->cpu) | 5290 | if (vcpu->cpu != freq->cpu) |
@@ -5273,7 +5294,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va | |||
5273 | send_ipi = 1; | 5294 | send_ipi = 1; |
5274 | } | 5295 | } |
5275 | } | 5296 | } |
5276 | raw_spin_unlock(&kvm_lock); | 5297 | spin_unlock(&kvm_lock); |
5277 | 5298 | ||
5278 | if (freq->old < freq->new && send_ipi) { | 5299 | if (freq->old < freq->new && send_ipi) { |
5279 | /* | 5300 | /* |
@@ -5426,12 +5447,12 @@ static void pvclock_gtod_update_fn(struct work_struct *work) | |||
5426 | struct kvm_vcpu *vcpu; | 5447 | struct kvm_vcpu *vcpu; |
5427 | int i; | 5448 | int i; |
5428 | 5449 | ||
5429 | raw_spin_lock(&kvm_lock); | 5450 | spin_lock(&kvm_lock); |
5430 | list_for_each_entry(kvm, &vm_list, vm_list) | 5451 | list_for_each_entry(kvm, &vm_list, vm_list) |
5431 | kvm_for_each_vcpu(i, vcpu, kvm) | 5452 | kvm_for_each_vcpu(i, vcpu, kvm) |
5432 | set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); | 5453 | set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); |
5433 | atomic_set(&kvm_guest_has_master_clock, 0); | 5454 | atomic_set(&kvm_guest_has_master_clock, 0); |
5434 | raw_spin_unlock(&kvm_lock); | 5455 | spin_unlock(&kvm_lock); |
5435 | } | 5456 | } |
5436 | 5457 | ||
5437 | static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); | 5458 | static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); |
@@ -5945,10 +5966,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5945 | 5966 | ||
5946 | vcpu->mode = IN_GUEST_MODE; | 5967 | vcpu->mode = IN_GUEST_MODE; |
5947 | 5968 | ||
5969 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
5970 | |||
5948 | /* We should set ->mode before check ->requests, | 5971 | /* We should set ->mode before check ->requests, |
5949 | * see the comment in make_all_cpus_request. | 5972 | * see the comment in make_all_cpus_request. |
5950 | */ | 5973 | */ |
5951 | smp_mb(); | 5974 | smp_mb__after_srcu_read_unlock(); |
5952 | 5975 | ||
5953 | local_irq_disable(); | 5976 | local_irq_disable(); |
5954 | 5977 | ||
@@ -5958,12 +5981,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) | |||
5958 | smp_wmb(); | 5981 | smp_wmb(); |
5959 | local_irq_enable(); | 5982 | local_irq_enable(); |
5960 | preempt_enable(); | 5983 | preempt_enable(); |
5984 | vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); | ||
5961 | r = 1; | 5985 | r = 1; |
5962 | goto cancel_injection; | 5986 | goto cancel_injection; |
5963 | } | 5987 | } |
5964 | 5988 | ||
5965 | srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); | ||
5966 | |||
5967 | if (req_immediate_exit) | 5989 | if (req_immediate_exit) |
5968 | smp_send_reschedule(vcpu->cpu); | 5990 | smp_send_reschedule(vcpu->cpu); |
5969 | 5991 | ||
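In vcpu_enter_guest(), the srcu_read_unlock() is hoisted to just after setting vcpu->mode, which lets the full smp_mb() be replaced by smp_mb__after_srcu_read_unlock(), a helper that piggybacks on the ordering the unlock already provides; the cancellation path re-takes the SRCU lock accordingly. Net effect: one heavyweight barrier removed from every guest entry.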
@@ -6688,7 +6710,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) | |||
6688 | if (r) | 6710 | if (r) |
6689 | return r; | 6711 | return r; |
6690 | kvm_vcpu_reset(vcpu); | 6712 | kvm_vcpu_reset(vcpu); |
6691 | r = kvm_mmu_setup(vcpu); | 6713 | kvm_mmu_setup(vcpu); |
6692 | vcpu_put(vcpu); | 6714 | vcpu_put(vcpu); |
6693 | 6715 | ||
6694 | return r; | 6716 | return r; |
@@ -6940,6 +6962,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) | |||
6940 | 6962 | ||
6941 | vcpu->arch.ia32_tsc_adjust_msr = 0x0; | 6963 | vcpu->arch.ia32_tsc_adjust_msr = 0x0; |
6942 | vcpu->arch.pv_time_enabled = false; | 6964 | vcpu->arch.pv_time_enabled = false; |
6965 | |||
6966 | vcpu->arch.guest_supported_xcr0 = 0; | ||
6967 | vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; | ||
6968 | |||
6943 | kvm_async_pf_hash_reset(vcpu); | 6969 | kvm_async_pf_hash_reset(vcpu); |
6944 | kvm_pmu_init(vcpu); | 6970 | kvm_pmu_init(vcpu); |
6945 | 6971 | ||
@@ -6981,6 +7007,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) | |||
6981 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); | 7007 | INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); |
6982 | INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); | 7008 | INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); |
6983 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); | 7009 | INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); |
7010 | atomic_set(&kvm->arch.noncoherent_dma_count, 0); | ||
6984 | 7011 | ||
6985 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ | 7012 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
6986 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); | 7013 | set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); |
@@ -7065,7 +7092,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) | |||
7065 | kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); | 7092 | kfree(rcu_dereference_check(kvm->arch.apic_map, 1)); |
7066 | } | 7093 | } |
7067 | 7094 | ||
7068 | void kvm_arch_free_memslot(struct kvm_memory_slot *free, | 7095 | void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, |
7069 | struct kvm_memory_slot *dont) | 7096 | struct kvm_memory_slot *dont) |
7070 | { | 7097 | { |
7071 | int i; | 7098 | int i; |
@@ -7086,7 +7113,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, | |||
7086 | } | 7113 | } |
7087 | } | 7114 | } |
7088 | 7115 | ||
7089 | int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) | 7116 | int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, |
7117 | unsigned long npages) | ||
7090 | { | 7118 | { |
7091 | int i; | 7119 | int i; |
7092 | 7120 | ||
@@ -7283,7 +7311,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) | |||
7283 | int r; | 7311 | int r; |
7284 | 7312 | ||
7285 | if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || | 7313 | if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || |
7286 | is_error_page(work->page)) | 7314 | work->wakeup_all) |
7287 | return; | 7315 | return; |
7288 | 7316 | ||
7289 | r = kvm_mmu_reload(vcpu); | 7317 | r = kvm_mmu_reload(vcpu); |
@@ -7393,7 +7421,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, | |||
7393 | struct x86_exception fault; | 7421 | struct x86_exception fault; |
7394 | 7422 | ||
7395 | trace_kvm_async_pf_ready(work->arch.token, work->gva); | 7423 | trace_kvm_async_pf_ready(work->arch.token, work->gva); |
7396 | if (is_error_page(work->page)) | 7424 | if (work->wakeup_all) |
7397 | work->arch.token = ~0; /* broadcast wakeup */ | 7425 | work->arch.token = ~0; /* broadcast wakeup */ |
7398 | else | 7426 | else |
7399 | kvm_del_async_pf_gfn(vcpu, work->arch.gfn); | 7427 | kvm_del_async_pf_gfn(vcpu, work->arch.gfn); |
@@ -7420,6 +7448,24 @@ bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) | |||
7420 | kvm_x86_ops->interrupt_allowed(vcpu); | 7448 | kvm_x86_ops->interrupt_allowed(vcpu); |
7421 | } | 7449 | } |
7422 | 7450 | ||
7451 | void kvm_arch_register_noncoherent_dma(struct kvm *kvm) | ||
7452 | { | ||
7453 | atomic_inc(&kvm->arch.noncoherent_dma_count); | ||
7454 | } | ||
7455 | EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma); | ||
7456 | |||
7457 | void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm) | ||
7458 | { | ||
7459 | atomic_dec(&kvm->arch.noncoherent_dma_count); | ||
7460 | } | ||
7461 | EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma); | ||
7462 | |||
7463 | bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) | ||
7464 | { | ||
7465 | return atomic_read(&kvm->arch.noncoherent_dma_count); | ||
7466 | } | ||
7467 | EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); | ||
7468 | |||
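The three helpers above replace direct inspection of iommu_domain/iommu_flags (compare the need_emulate_wbinvd() and vmx_get_mt_mask() hunks earlier) with a plain reference count that device code, such as the new VFIO glue, bumps while a non-coherent DMA device is attached. The pattern, modeled in portable C11:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int noncoherent_dma_count;

    static void register_noncoherent_dma(void)   { atomic_fetch_add(&noncoherent_dma_count, 1); }
    static void unregister_noncoherent_dma(void) { atomic_fetch_sub(&noncoherent_dma_count, 1); }
    static bool has_noncoherent_dma(void)        { return atomic_load(&noncoherent_dma_count) != 0; }

    int main(void)
    {
        register_noncoherent_dma();              /* device attach */
        printf("%d\n", has_noncoherent_dma());   /* 1: honor guest memory type / WBINVD */
        unregister_noncoherent_dma();            /* device detach */
        printf("%d\n", has_noncoherent_dma());   /* 0: coherent, force write-back */
        return 0;
    }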
7423 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); | 7469 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); |
7424 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); | 7470 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); |
7425 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); | 7471 | EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); |
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e224f7a671b6..587fb9ede436 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h | |||
@@ -122,6 +122,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, | |||
122 | gva_t addr, void *val, unsigned int bytes, | 122 | gva_t addr, void *val, unsigned int bytes, |
123 | struct x86_exception *exception); | 123 | struct x86_exception *exception); |
124 | 124 | ||
125 | #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) | ||
125 | extern u64 host_xcr0; | 126 | extern u64 host_xcr0; |
126 | 127 | ||
127 | extern struct static_key kvm_no_apic_vcpu; | 128 | extern struct static_key kvm_no_apic_vcpu; |