-rw-r--r--  Documentation/virtual/kvm/api.txt |  13
-rw-r--r--  arch/arm/kvm/arm.c                |   4
-rw-r--r--  arch/mips/kvm/mips.c              |   7
-rw-r--r--  arch/powerpc/kvm/powerpc.c        |   6
-rw-r--r--  arch/s390/kvm/kvm-s390.c          |   4
-rw-r--r--  arch/x86/include/asm/kvm_host.h   |   5
-rw-r--r--  arch/x86/kvm/cpuid.c              |   2
-rw-r--r--  arch/x86/kvm/lapic.c              |  64
-rw-r--r--  arch/x86/kvm/lapic.h              |   4
-rw-r--r--  arch/x86/kvm/svm.c                |  55
-rw-r--r--  arch/x86/kvm/vmx.c                | 752
-rw-r--r--  arch/x86/kvm/x86.c                | 109
-rw-r--r--  drivers/ptp/ptp_kvm.c             |   7
-rw-r--r--  include/linux/kvm_host.h          |  17
-rw-r--r--  include/uapi/linux/kvm.h          |  17
-rw-r--r--  virt/kvm/kvm_main.c               | 113
16 files changed, 572 insertions(+), 594 deletions(-)
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index e4f2cdcf78eb..069450938b79 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3389,7 +3389,18 @@ struct kvm_run {
3389Request that KVM_RUN return when it becomes possible to inject external 3389Request that KVM_RUN return when it becomes possible to inject external
3390interrupts into the guest. Useful in conjunction with KVM_INTERRUPT. 3390interrupts into the guest. Useful in conjunction with KVM_INTERRUPT.
3391 3391
3392 __u8 padding1[7]; 3392 __u8 immediate_exit;
3393
3394This field is polled once when KVM_RUN starts; if non-zero, KVM_RUN
3395exits immediately, returning -EINTR. In the common scenario where a
3396signal is used to "kick" a VCPU out of KVM_RUN, this field can be used
3397to avoid usage of KVM_SET_SIGNAL_MASK, which has worse scalability.
3398Rather than blocking the signal outside KVM_RUN, userspace can set up
3399a signal handler that sets run->immediate_exit to a non-zero value.
3400
3401This field is ignored if KVM_CAP_IMMEDIATE_EXIT is not available.
3402
3403 __u8 padding1[6];
3393 3404
3394 /* out */ 3405 /* out */
3395 __u32 exit_reason; 3406 __u32 exit_reason;
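
The api.txt text added above describes the intended userspace pattern for run->immediate_exit. A minimal sketch of that pattern follows; it is illustrative only and not part of the patch (the choice of SIGUSR1, the run/vcpu_fd names and the pthread_kill-based kick are assumptions):

#include <errno.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_run *run;             /* mmap()ed kvm_run area of this vCPU */

/* Signal handler used to "kick" the vCPU thread out of KVM_RUN. */
static void kick_handler(int sig)
{
        run->immediate_exit = 1;
}

/* Install the handler when the vCPU thread is created; another thread
 * kicks the vCPU with pthread_kill(vcpu_thread, SIGUSR1). */
static void vcpu_setup_kick(void)
{
        struct sigaction sa = { .sa_handler = kick_handler };

        sigaction(SIGUSR1, &sa, NULL);
}

/* Enter the guest once.  Returns 0 if the vCPU was kicked, 1 on a normal
 * exit, -1 on error.  The caller clears run->immediate_exit only after it
 * has serviced the kick, then calls vcpu_enter() again. */
static int vcpu_enter(int vcpu_fd)
{
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0) {
                if (errno == EINTR)
                        return 0;       /* kicked before or during KVM_RUN */
                return -1;
        }
        /* ... dispatch on run->exit_reason ... */
        return 1;
}

With this scheme a kick cannot be lost: a signal that arrives before the ioctl sets immediate_exit, which KVM_RUN polls on entry and returns -EINTR; a signal that arrives while the guest is running interrupts KVM_RUN as usual. The signal therefore never needs to be blocked outside KVM_RUN, so KVM_SET_SIGNAL_MASK is unnecessary.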
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 21c493a9e5c9..c9a2103faeb9 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -206,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
206 case KVM_CAP_ARM_PSCI_0_2: 206 case KVM_CAP_ARM_PSCI_0_2:
207 case KVM_CAP_READONLY_MEM: 207 case KVM_CAP_READONLY_MEM:
208 case KVM_CAP_MP_STATE: 208 case KVM_CAP_MP_STATE:
209 case KVM_CAP_IMMEDIATE_EXIT:
209 r = 1; 210 r = 1;
210 break; 211 break;
211 case KVM_CAP_COALESCED_MMIO: 212 case KVM_CAP_COALESCED_MMIO:
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
604 return ret; 605 return ret;
605 } 606 }
606 607
608 if (run->immediate_exit)
609 return -EINTR;
610
607 if (vcpu->sigset_active) 611 if (vcpu->sigset_active)
608 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 612 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
609 613
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 31ee5ee0010b..ed81e5ac1426 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -397,7 +397,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
397 397
398int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) 398int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
399{ 399{
400 int r = 0; 400 int r = -EINTR;
401 sigset_t sigsaved; 401 sigset_t sigsaved;
402 402
403 if (vcpu->sigset_active) 403 if (vcpu->sigset_active)
@@ -409,6 +409,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
409 vcpu->mmio_needed = 0; 409 vcpu->mmio_needed = 0;
410 } 410 }
411 411
412 if (run->immediate_exit)
413 goto out;
414
412 lose_fpu(1); 415 lose_fpu(1);
413 416
414 local_irq_disable(); 417 local_irq_disable();
@@ -429,6 +432,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
429 guest_exit_irqoff(); 432 guest_exit_irqoff();
430 local_irq_enable(); 433 local_irq_enable();
431 434
435out:
432 if (vcpu->sigset_active) 436 if (vcpu->sigset_active)
433 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 437 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
434 438
@@ -1021,6 +1025,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
1021 case KVM_CAP_ENABLE_CAP: 1025 case KVM_CAP_ENABLE_CAP:
1022 case KVM_CAP_READONLY_MEM: 1026 case KVM_CAP_READONLY_MEM:
1023 case KVM_CAP_SYNC_MMU: 1027 case KVM_CAP_SYNC_MMU:
1028 case KVM_CAP_IMMEDIATE_EXIT:
1024 r = 1; 1029 r = 1;
1025 break; 1030 break;
1026 case KVM_CAP_COALESCED_MMIO: 1031 case KVM_CAP_COALESCED_MMIO:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index fcb253ba51e5..2b38d824e9e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
511 case KVM_CAP_ONE_REG: 511 case KVM_CAP_ONE_REG:
512 case KVM_CAP_IOEVENTFD: 512 case KVM_CAP_IOEVENTFD:
513 case KVM_CAP_DEVICE_CTRL: 513 case KVM_CAP_DEVICE_CTRL:
514 case KVM_CAP_IMMEDIATE_EXIT:
514 r = 1; 515 r = 1;
515 break; 516 break;
516 case KVM_CAP_PPC_PAIRED_SINGLES: 517 case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -1118,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
1118#endif 1119#endif
1119 } 1120 }
1120 1121
1121 r = kvmppc_vcpu_run(run, vcpu); 1122 if (run->immediate_exit)
1123 r = -EINTR;
1124 else
1125 r = kvmppc_vcpu_run(run, vcpu);
1122 1126
1123 if (vcpu->sigset_active) 1127 if (vcpu->sigset_active)
1124 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 1128 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 502de74ea984..99e35fe0dea8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -370,6 +370,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
370 case KVM_CAP_S390_IRQCHIP: 370 case KVM_CAP_S390_IRQCHIP:
371 case KVM_CAP_VM_ATTRIBUTES: 371 case KVM_CAP_VM_ATTRIBUTES:
372 case KVM_CAP_MP_STATE: 372 case KVM_CAP_MP_STATE:
373 case KVM_CAP_IMMEDIATE_EXIT:
373 case KVM_CAP_S390_INJECT_IRQ: 374 case KVM_CAP_S390_INJECT_IRQ:
374 case KVM_CAP_S390_USER_SIGP: 375 case KVM_CAP_S390_USER_SIGP:
375 case KVM_CAP_S390_USER_STSI: 376 case KVM_CAP_S390_USER_STSI:
@@ -2798,6 +2799,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798 int rc; 2799 int rc;
2799 sigset_t sigsaved; 2800 sigset_t sigsaved;
2800 2801
2802 if (kvm_run->immediate_exit)
2803 return -EINTR;
2804
2801 if (guestdbg_exit_pending(vcpu)) { 2805 if (guestdbg_exit_pending(vcpu)) {
2802 kvm_s390_prepare_debug_exit(vcpu); 2806 kvm_s390_prepare_debug_exit(vcpu);
2803 return 0; 2807 return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 417502cf42b6..74ef58c8ff53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
55#define KVM_REQ_TRIPLE_FAULT 10 55#define KVM_REQ_TRIPLE_FAULT 10
56#define KVM_REQ_MMU_SYNC 11 56#define KVM_REQ_MMU_SYNC 11
57#define KVM_REQ_CLOCK_UPDATE 12 57#define KVM_REQ_CLOCK_UPDATE 12
58#define KVM_REQ_DEACTIVATE_FPU 13
59#define KVM_REQ_EVENT 14 58#define KVM_REQ_EVENT 14
60#define KVM_REQ_APF_HALT 15 59#define KVM_REQ_APF_HALT 15
61#define KVM_REQ_STEAL_UPDATE 16 60#define KVM_REQ_STEAL_UPDATE 16
@@ -936,8 +935,6 @@ struct kvm_x86_ops {
936 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 935 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
937 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 936 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
938 u32 (*get_pkru)(struct kvm_vcpu *vcpu); 937 u32 (*get_pkru)(struct kvm_vcpu *vcpu);
939 void (*fpu_activate)(struct kvm_vcpu *vcpu);
940 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
941 938
942 void (*tlb_flush)(struct kvm_vcpu *vcpu); 939 void (*tlb_flush)(struct kvm_vcpu *vcpu);
943 940
@@ -969,7 +966,7 @@ struct kvm_x86_ops {
969 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); 966 void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
970 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); 967 void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
971 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); 968 void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
972 void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); 969 int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
973 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 970 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
974 int (*get_tdp_level)(void); 971 int (*get_tdp_level)(void);
975 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 972 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c0e2036217ad..1d155cc56629 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
123 if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) 123 if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
124 best->ebx = xstate_required_size(vcpu->arch.xcr0, true); 124 best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
125 125
126 kvm_x86_ops->fpu_activate(vcpu);
127
128 /* 126 /*
129 * The existing code assumes virtual address is 48-bit in the canonical 127 * The existing code assumes virtual address is 48-bit in the canonical
130 * address checks; exit if it is ever changed. 128 * address checks; exit if it is ever changed.
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 33b799fd3a6e..bad6a25067bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -341,7 +341,7 @@ static int find_highest_vector(void *bitmap)
341 vec >= 0; vec -= APIC_VECTORS_PER_REG) { 341 vec >= 0; vec -= APIC_VECTORS_PER_REG) {
342 reg = bitmap + REG_POS(vec); 342 reg = bitmap + REG_POS(vec);
343 if (*reg) 343 if (*reg)
344 return fls(*reg) - 1 + vec; 344 return __fls(*reg) + vec;
345 } 345 }
346 346
347 return -1; 347 return -1;
@@ -361,27 +361,32 @@ static u8 count_vectors(void *bitmap)
361 return count; 361 return count;
362} 362}
363 363
364void __kvm_apic_update_irr(u32 *pir, void *regs) 364int __kvm_apic_update_irr(u32 *pir, void *regs)
365{ 365{
366 u32 i, pir_val; 366 u32 i, vec;
367 u32 pir_val, irr_val;
368 int max_irr = -1;
367 369
368 for (i = 0; i <= 7; i++) { 370 for (i = vec = 0; i <= 7; i++, vec += 32) {
369 pir_val = READ_ONCE(pir[i]); 371 pir_val = READ_ONCE(pir[i]);
372 irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
370 if (pir_val) { 373 if (pir_val) {
371 pir_val = xchg(&pir[i], 0); 374 irr_val |= xchg(&pir[i], 0);
372 *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; 375 *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
373 } 376 }
377 if (irr_val)
378 max_irr = __fls(irr_val) + vec;
374 } 379 }
380
381 return max_irr;
375} 382}
376EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); 383EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
377 384
378void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) 385int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
379{ 386{
380 struct kvm_lapic *apic = vcpu->arch.apic; 387 struct kvm_lapic *apic = vcpu->arch.apic;
381 388
382 __kvm_apic_update_irr(pir, apic->regs); 389 return __kvm_apic_update_irr(pir, apic->regs);
383
384 kvm_make_request(KVM_REQ_EVENT, vcpu);
385} 390}
386EXPORT_SYMBOL_GPL(kvm_apic_update_irr); 391EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
387 392
@@ -401,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
401 if (!apic->irr_pending) 406 if (!apic->irr_pending)
402 return -1; 407 return -1;
403 408
404 if (apic->vcpu->arch.apicv_active)
405 kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
406 result = apic_search_irr(apic); 409 result = apic_search_irr(apic);
407 ASSERT(result == -1 || result >= 16); 410 ASSERT(result == -1 || result >= 16);
408 411
@@ -416,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
416 vcpu = apic->vcpu; 419 vcpu = apic->vcpu;
417 420
418 if (unlikely(vcpu->arch.apicv_active)) { 421 if (unlikely(vcpu->arch.apicv_active)) {
419 /* try to update RVI */ 422 /* need to update RVI */
420 apic_clear_vector(vec, apic->regs + APIC_IRR); 423 apic_clear_vector(vec, apic->regs + APIC_IRR);
421 kvm_make_request(KVM_REQ_EVENT, vcpu); 424 kvm_x86_ops->hwapic_irr_update(vcpu,
425 apic_find_highest_irr(apic));
422 } else { 426 } else {
423 apic->irr_pending = false; 427 apic->irr_pending = false;
424 apic_clear_vector(vec, apic->regs + APIC_IRR); 428 apic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -508,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
508 */ 512 */
509 return apic_find_highest_irr(vcpu->arch.apic); 513 return apic_find_highest_irr(vcpu->arch.apic);
510} 514}
515EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
511 516
512static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, 517static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
513 int vector, int level, int trig_mode, 518 int vector, int level, int trig_mode,
@@ -524,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
524 529
525static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) 530static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
526{ 531{
527 532 return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
528 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, 533 sizeof(val));
529 sizeof(val));
530} 534}
531 535
532static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) 536static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
533{ 537{
534 538 return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
535 return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, 539 sizeof(*val));
536 sizeof(*val));
537} 540}
538 541
539static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) 542static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -572,7 +575,11 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
572 575
573static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) 576static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
574{ 577{
575 int highest_irr = apic_find_highest_irr(apic); 578 int highest_irr;
579 if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
580 highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
581 else
582 highest_irr = apic_find_highest_irr(apic);
576 if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) 583 if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
577 return -1; 584 return -1;
578 return highest_irr; 585 return highest_irr;
@@ -2204,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
2204 1 : count_vectors(apic->regs + APIC_ISR); 2211 1 : count_vectors(apic->regs + APIC_ISR);
2205 apic->highest_isr_cache = -1; 2212 apic->highest_isr_cache = -1;
2206 if (vcpu->arch.apicv_active) { 2213 if (vcpu->arch.apicv_active) {
2207 if (kvm_x86_ops->apicv_post_state_restore) 2214 kvm_x86_ops->apicv_post_state_restore(vcpu);
2208 kvm_x86_ops->apicv_post_state_restore(vcpu);
2209 kvm_x86_ops->hwapic_irr_update(vcpu, 2215 kvm_x86_ops->hwapic_irr_update(vcpu,
2210 apic_find_highest_irr(apic)); 2216 apic_find_highest_irr(apic));
2211 kvm_x86_ops->hwapic_isr_update(vcpu, 2217 kvm_x86_ops->hwapic_isr_update(vcpu,
@@ -2279,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
2279 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) 2285 if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
2280 return; 2286 return;
2281 2287
2282 if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, 2288 if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
2283 sizeof(u32))) 2289 sizeof(u32)))
2284 return; 2290 return;
2285 2291
2286 apic_set_tpr(vcpu->arch.apic, data & 0xff); 2292 apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2332,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
2332 max_isr = 0; 2338 max_isr = 0;
2333 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); 2339 data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
2334 2340
2335 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, 2341 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
2336 sizeof(u32)); 2342 sizeof(u32));
2337} 2343}
2338 2344
2339int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) 2345int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
2340{ 2346{
2341 if (vapic_addr) { 2347 if (vapic_addr) {
2342 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, 2348 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
2343 &vcpu->arch.apic->vapic_cache, 2349 &vcpu->arch.apic->vapic_cache,
2344 vapic_addr, sizeof(u32))) 2350 vapic_addr, sizeof(u32)))
2345 return -EINVAL; 2351 return -EINVAL;
@@ -2433,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
2433 vcpu->arch.pv_eoi.msr_val = data; 2439 vcpu->arch.pv_eoi.msr_val = data;
2434 if (!pv_eoi_enabled(vcpu)) 2440 if (!pv_eoi_enabled(vcpu))
2435 return 0; 2441 return 0;
2436 return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, 2442 return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
2437 addr, sizeof(u8)); 2443 addr, sizeof(u8));
2438} 2444}
2439 2445
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 05abd837b78a..bcbe811f3b97 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -71,8 +71,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
71bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, 71bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
72 int short_hand, unsigned int dest, int dest_mode); 72 int short_hand, unsigned int dest, int dest_mode);
73 73
74void __kvm_apic_update_irr(u32 *pir, void *regs); 74int __kvm_apic_update_irr(u32 *pir, void *regs);
75void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); 75int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
76void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); 76void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
77int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, 77int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
78 struct dest_map *dest_map); 78 struct dest_map *dest_map);
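
The lapic.c/lapic.h changes above make __kvm_apic_update_irr() return the highest vector left set in the IRR, using __fls(reg) + vec (equivalent to the old fls(reg) - 1 + vec for a non-zero register). A self-contained model of that scan, for illustration only (plain C rather than kernel code; __fls() is modeled with a compiler builtin):

#include <stdint.h>

/* Model of the IRR scan: 256 vectors live in 8 x 32-bit registers and the
 * highest set bit wins.  Returns -1 if no vector is pending. */
static int highest_pending_vector(const uint32_t irr[8])
{
        int i, max_irr = -1;

        for (i = 0; i < 8; i++)
                if (irr[i])
                        /* 31 - clz(x) is the index of the highest set bit,
                         * i.e. what __fls() returns for a non-zero word. */
                        max_irr = (31 - __builtin_clz(irr[i])) + i * 32;

        return max_irr;
}

Returning this value lets the new vmx_sync_pir_to_irr() (later in this diff) update RVI directly through vmx_hwapic_irr_update() instead of raising KVM_REQ_EVENT and rescanning the IRR a second time before vmentry.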
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d0414f054bdf..d1efe2c62b3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
971 * a particular vCPU. 971 * a particular vCPU.
972 */ 972 */
973#define SVM_VM_DATA_HASH_BITS 8 973#define SVM_VM_DATA_HASH_BITS 8
974DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); 974static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
975static spinlock_t svm_vm_data_hash_lock; 975static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
976 976
977/* Note: 977/* Note:
978 * This function is called from IOMMU driver to notify 978 * This function is called from IOMMU driver to notify
@@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void)
1077 } else { 1077 } else {
1078 pr_info("AVIC enabled\n"); 1078 pr_info("AVIC enabled\n");
1079 1079
1080 hash_init(svm_vm_data_hash);
1081 spin_lock_init(&svm_vm_data_hash_lock);
1082 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); 1080 amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1083 } 1081 }
1084 } 1082 }
@@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
1159 struct vmcb_control_area *control = &svm->vmcb->control; 1157 struct vmcb_control_area *control = &svm->vmcb->control;
1160 struct vmcb_save_area *save = &svm->vmcb->save; 1158 struct vmcb_save_area *save = &svm->vmcb->save;
1161 1159
1162 svm->vcpu.fpu_active = 1;
1163 svm->vcpu.arch.hflags = 0; 1160 svm->vcpu.arch.hflags = 0;
1164 1161
1165 set_cr_intercept(svm, INTERCEPT_CR0_READ); 1162 set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1901 ulong gcr0 = svm->vcpu.arch.cr0; 1898 ulong gcr0 = svm->vcpu.arch.cr0;
1902 u64 *hcr0 = &svm->vmcb->save.cr0; 1899 u64 *hcr0 = &svm->vmcb->save.cr0;
1903 1900
1904 if (!svm->vcpu.fpu_active) 1901 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1905 *hcr0 |= SVM_CR0_SELECTIVE_MASK; 1902 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1906 else
1907 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1908 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1909 1903
1910 mark_dirty(svm->vmcb, VMCB_CR); 1904 mark_dirty(svm->vmcb, VMCB_CR);
1911 1905
1912 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1906 if (gcr0 == *hcr0) {
1913 clr_cr_intercept(svm, INTERCEPT_CR0_READ); 1907 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1914 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE); 1908 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1915 } else { 1909 } else {
@@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1940 if (!npt_enabled) 1934 if (!npt_enabled)
1941 cr0 |= X86_CR0_PG | X86_CR0_WP; 1935 cr0 |= X86_CR0_PG | X86_CR0_WP;
1942 1936
1943 if (!vcpu->fpu_active)
1944 cr0 |= X86_CR0_TS;
1945 /* 1937 /*
1946 * re-enable caching here because the QEMU bios 1938 * re-enable caching here because the QEMU bios
1947 * does not do it - this results in some delay at 1939 * does not do it - this results in some delay at
@@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
2160 return 1; 2152 return 1;
2161} 2153}
2162 2154
2163static void svm_fpu_activate(struct kvm_vcpu *vcpu)
2164{
2165 struct vcpu_svm *svm = to_svm(vcpu);
2166
2167 clr_exception_intercept(svm, NM_VECTOR);
2168
2169 svm->vcpu.fpu_active = 1;
2170 update_cr0_intercept(svm);
2171}
2172
2173static int nm_interception(struct vcpu_svm *svm)
2174{
2175 svm_fpu_activate(&svm->vcpu);
2176 return 1;
2177}
2178
2179static bool is_erratum_383(void) 2155static bool is_erratum_383(void)
2180{ 2156{
2181 int err, i; 2157 int err, i;
@@ -2573,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
2573 if (!npt_enabled && svm->apf_reason == 0) 2549 if (!npt_enabled && svm->apf_reason == 0)
2574 return NESTED_EXIT_HOST; 2550 return NESTED_EXIT_HOST;
2575 break; 2551 break;
2576 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
2577 nm_interception(svm);
2578 break;
2579 default: 2552 default:
2580 break; 2553 break;
2581 } 2554 }
@@ -4020,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4020 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3993 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
4021 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3994 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
4022 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 3995 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
4023 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
4024 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 3996 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
4025 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, 3997 [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
4026 [SVM_EXIT_INTR] = intr_interception, 3998 [SVM_EXIT_INTR] = intr_interception,
@@ -4359,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
4359 return; 4331 return;
4360} 4332}
4361 4333
4362static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4363{
4364 return;
4365}
4366
4367static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) 4334static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
4368{ 4335{
4369 kvm_lapic_set_irr(vec, vcpu->arch.apic); 4336 kvm_lapic_set_irr(vec, vcpu->arch.apic);
@@ -5079,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
5079 return true; 5046 return true;
5080} 5047}
5081 5048
5082static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
5083{
5084 struct vcpu_svm *svm = to_svm(vcpu);
5085
5086 set_exception_intercept(svm, NM_VECTOR);
5087 update_cr0_intercept(svm);
5088}
5089
5090#define PRE_EX(exit) { .exit_code = (exit), \ 5049#define PRE_EX(exit) { .exit_code = (exit), \
5091 .stage = X86_ICPT_PRE_EXCEPT, } 5050 .stage = X86_ICPT_PRE_EXCEPT, }
5092#define POST_EX(exit) { .exit_code = (exit), \ 5051#define POST_EX(exit) { .exit_code = (exit), \
@@ -5347,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5347 5306
5348 .get_pkru = svm_get_pkru, 5307 .get_pkru = svm_get_pkru,
5349 5308
5350 .fpu_activate = svm_fpu_activate,
5351 .fpu_deactivate = svm_fpu_deactivate,
5352
5353 .tlb_flush = svm_flush_tlb, 5309 .tlb_flush = svm_flush_tlb,
5354 5310
5355 .run = svm_vcpu_run, 5311 .run = svm_vcpu_run,
@@ -5373,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
5373 .get_enable_apicv = svm_get_enable_apicv, 5329 .get_enable_apicv = svm_get_enable_apicv,
5374 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, 5330 .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
5375 .load_eoi_exitmap = svm_load_eoi_exitmap, 5331 .load_eoi_exitmap = svm_load_eoi_exitmap,
5376 .sync_pir_to_irr = svm_sync_pir_to_irr,
5377 .hwapic_irr_update = svm_hwapic_irr_update, 5332 .hwapic_irr_update = svm_hwapic_irr_update,
5378 .hwapic_isr_update = svm_hwapic_isr_update, 5333 .hwapic_isr_update = svm_hwapic_isr_update,
5379 .apicv_post_state_restore = avic_post_state_restore, 5334 .apicv_post_state_restore = avic_post_state_restore,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7c3e42623090..9856b73a21ad 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1856 u32 eb; 1856 u32 eb;
1857 1857
1858 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) | 1858 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1859 (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); 1859 (1u << DB_VECTOR) | (1u << AC_VECTOR);
1860 if ((vcpu->guest_debug & 1860 if ((vcpu->guest_debug &
1861 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == 1861 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1862 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) 1862 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1865 eb = ~0; 1865 eb = ~0;
1866 if (enable_ept) 1866 if (enable_ept)
1867 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 1867 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1868 if (vcpu->fpu_active)
1869 eb &= ~(1u << NM_VECTOR);
1870 1868
1871 /* When we are running a nested L2 guest and L1 specified for it a 1869 /* When we are running a nested L2 guest and L1 specified for it a
1872 * certain exception bitmap, we must trap the same exceptions and pass 1870 * certain exception bitmap, we must trap the same exceptions and pass
@@ -2340,25 +2338,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
2340 } 2338 }
2341} 2339}
2342 2340
2343static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
2344{
2345 ulong cr0;
2346
2347 if (vcpu->fpu_active)
2348 return;
2349 vcpu->fpu_active = 1;
2350 cr0 = vmcs_readl(GUEST_CR0);
2351 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
2352 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
2353 vmcs_writel(GUEST_CR0, cr0);
2354 update_exception_bitmap(vcpu);
2355 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
2356 if (is_guest_mode(vcpu))
2357 vcpu->arch.cr0_guest_owned_bits &=
2358 ~get_vmcs12(vcpu)->cr0_guest_host_mask;
2359 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2360}
2361
2362static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 2341static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
2363 2342
2364/* 2343/*
@@ -2377,33 +2356,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2377 (fields->cr4_read_shadow & fields->cr4_guest_host_mask); 2356 (fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2378} 2357}
2379 2358
2380static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
2381{
2382 /* Note that there is no vcpu->fpu_active = 0 here. The caller must
2383 * set this *before* calling this function.
2384 */
2385 vmx_decache_cr0_guest_bits(vcpu);
2386 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
2387 update_exception_bitmap(vcpu);
2388 vcpu->arch.cr0_guest_owned_bits = 0;
2389 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2390 if (is_guest_mode(vcpu)) {
2391 /*
2392 * L1's specified read shadow might not contain the TS bit,
2393 * so now that we turned on shadowing of this bit, we need to
2394 * set this bit of the shadow. Like in nested_vmx_run we need
2395 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
2396 * up-to-date here because we just decached cr0.TS (and we'll
2397 * only update vmcs12->guest_cr0 on nested exit).
2398 */
2399 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2400 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
2401 (vcpu->arch.cr0 & X86_CR0_TS);
2402 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2403 } else
2404 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2405}
2406
2407static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 2359static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2408{ 2360{
2409 unsigned long rflags, save_rflags; 2361 unsigned long rflags, save_rflags;
@@ -4232,9 +4184,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
4232 if (enable_ept) 4184 if (enable_ept)
4233 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 4185 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
4234 4186
4235 if (!vcpu->fpu_active)
4236 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
4237
4238 vmcs_writel(CR0_READ_SHADOW, cr0); 4187 vmcs_writel(CR0_READ_SHADOW, cr0);
4239 vmcs_writel(GUEST_CR0, hw_cr0); 4188 vmcs_writel(GUEST_CR0, hw_cr0);
4240 vcpu->arch.cr0 = cr0; 4189 vcpu->arch.cr0 = cr0;
@@ -5051,26 +5000,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
5051 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 5000 if (pi_test_and_set_pir(vector, &vmx->pi_desc))
5052 return; 5001 return;
5053 5002
5054 r = pi_test_and_set_on(&vmx->pi_desc); 5003 /* If a previous notification has sent the IPI, nothing to do. */
5055 kvm_make_request(KVM_REQ_EVENT, vcpu); 5004 if (pi_test_and_set_on(&vmx->pi_desc))
5056 if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
5057 kvm_vcpu_kick(vcpu);
5058}
5059
5060static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
5061{
5062 struct vcpu_vmx *vmx = to_vmx(vcpu);
5063
5064 if (!pi_test_on(&vmx->pi_desc))
5065 return; 5005 return;
5066 5006
5067 pi_clear_on(&vmx->pi_desc); 5007 if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
5068 /* 5008 kvm_vcpu_kick(vcpu);
5069 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
5070 * But on x86 this is just a compiler barrier anyway.
5071 */
5072 smp_mb__after_atomic();
5073 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
5074} 5009}
5075 5010
5076/* 5011/*
@@ -5335,7 +5270,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
5335 /* 22.2.1, 20.8.1 */ 5270 /* 22.2.1, 20.8.1 */
5336 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 5271 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
5337 5272
5338 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 5273 vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
5274 vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
5275
5339 set_cr4_guest_host_mask(vmx); 5276 set_cr4_guest_host_mask(vmx);
5340 5277
5341 if (vmx_xsaves_supported()) 5278 if (vmx_xsaves_supported())
@@ -5439,7 +5376,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
5439 vmx_set_cr0(vcpu, cr0); /* enter rmode */ 5376 vmx_set_cr0(vcpu, cr0); /* enter rmode */
5440 vmx_set_cr4(vcpu, 0); 5377 vmx_set_cr4(vcpu, 0);
5441 vmx_set_efer(vcpu, 0); 5378 vmx_set_efer(vcpu, 0);
5442 vmx_fpu_activate(vcpu); 5379
5443 update_exception_bitmap(vcpu); 5380 update_exception_bitmap(vcpu);
5444 5381
5445 vpid_sync_context(vmx->vpid); 5382 vpid_sync_context(vmx->vpid);
@@ -5473,26 +5410,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
5473 5410
5474static void enable_irq_window(struct kvm_vcpu *vcpu) 5411static void enable_irq_window(struct kvm_vcpu *vcpu)
5475{ 5412{
5476 u32 cpu_based_vm_exec_control; 5413 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5477 5414 CPU_BASED_VIRTUAL_INTR_PENDING);
5478 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5479 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
5480 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5481} 5415}
5482 5416
5483static void enable_nmi_window(struct kvm_vcpu *vcpu) 5417static void enable_nmi_window(struct kvm_vcpu *vcpu)
5484{ 5418{
5485 u32 cpu_based_vm_exec_control;
5486
5487 if (!cpu_has_virtual_nmis() || 5419 if (!cpu_has_virtual_nmis() ||
5488 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 5420 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
5489 enable_irq_window(vcpu); 5421 enable_irq_window(vcpu);
5490 return; 5422 return;
5491 } 5423 }
5492 5424
5493 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5425 vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
5494 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 5426 CPU_BASED_VIRTUAL_NMI_PENDING);
5495 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5496} 5427}
5497 5428
5498static void vmx_inject_irq(struct kvm_vcpu *vcpu) 5429static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -5718,11 +5649,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
5718 if (is_nmi(intr_info)) 5649 if (is_nmi(intr_info))
5719 return 1; /* already handled by vmx_vcpu_run() */ 5650 return 1; /* already handled by vmx_vcpu_run() */
5720 5651
5721 if (is_no_device(intr_info)) {
5722 vmx_fpu_activate(vcpu);
5723 return 1;
5724 }
5725
5726 if (is_invalid_opcode(intr_info)) { 5652 if (is_invalid_opcode(intr_info)) {
5727 if (is_guest_mode(vcpu)) { 5653 if (is_guest_mode(vcpu)) {
5728 kvm_queue_exception(vcpu, UD_VECTOR); 5654 kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5912,22 +5838,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5912 return kvm_set_cr4(vcpu, val); 5838 return kvm_set_cr4(vcpu, val);
5913} 5839}
5914 5840
5915/* called to set cr0 as appropriate for clts instruction exit. */
5916static void handle_clts(struct kvm_vcpu *vcpu)
5917{
5918 if (is_guest_mode(vcpu)) {
5919 /*
5920 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
5921 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
5922 * just pretend it's off (also in arch.cr0 for fpu_activate).
5923 */
5924 vmcs_writel(CR0_READ_SHADOW,
5925 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
5926 vcpu->arch.cr0 &= ~X86_CR0_TS;
5927 } else
5928 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5929}
5930
5931static int handle_cr(struct kvm_vcpu *vcpu) 5841static int handle_cr(struct kvm_vcpu *vcpu)
5932{ 5842{
5933 unsigned long exit_qualification, val; 5843 unsigned long exit_qualification, val;
@@ -5973,9 +5883,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
5973 } 5883 }
5974 break; 5884 break;
5975 case 2: /* clts */ 5885 case 2: /* clts */
5976 handle_clts(vcpu); 5886 WARN_ONCE(1, "Guest should always own CR0.TS");
5887 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5977 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5888 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5978 vmx_fpu_activate(vcpu);
5979 return kvm_skip_emulated_instruction(vcpu); 5889 return kvm_skip_emulated_instruction(vcpu);
5980 case 1: /*mov from cr*/ 5890 case 1: /*mov from cr*/
5981 switch (cr) { 5891 switch (cr) {
@@ -6151,12 +6061,8 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
6151 6061
6152static int handle_interrupt_window(struct kvm_vcpu *vcpu) 6062static int handle_interrupt_window(struct kvm_vcpu *vcpu)
6153{ 6063{
6154 u32 cpu_based_vm_exec_control; 6064 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6155 6065 CPU_BASED_VIRTUAL_INTR_PENDING);
6156 /* clear pending irq */
6157 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6158 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
6159 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
6160 6066
6161 kvm_make_request(KVM_REQ_EVENT, vcpu); 6067 kvm_make_request(KVM_REQ_EVENT, vcpu);
6162 6068
@@ -6382,6 +6288,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
6382 EPT_VIOLATION_EXECUTABLE)) 6288 EPT_VIOLATION_EXECUTABLE))
6383 ? PFERR_PRESENT_MASK : 0; 6289 ? PFERR_PRESENT_MASK : 0;
6384 6290
6291 vcpu->arch.gpa_available = true;
6385 vcpu->arch.exit_qualification = exit_qualification; 6292 vcpu->arch.exit_qualification = exit_qualification;
6386 6293
6387 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 6294 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6399,6 +6306,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6399 } 6306 }
6400 6307
6401 ret = handle_mmio_page_fault(vcpu, gpa, true); 6308 ret = handle_mmio_page_fault(vcpu, gpa, true);
6309 vcpu->arch.gpa_available = true;
6402 if (likely(ret == RET_MMIO_PF_EMULATE)) 6310 if (likely(ret == RET_MMIO_PF_EMULATE))
6403 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 6311 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
6404 EMULATE_DONE; 6312 EMULATE_DONE;
@@ -6420,12 +6328,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
6420 6328
6421static int handle_nmi_window(struct kvm_vcpu *vcpu) 6329static int handle_nmi_window(struct kvm_vcpu *vcpu)
6422{ 6330{
6423 u32 cpu_based_vm_exec_control; 6331 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
6424 6332 CPU_BASED_VIRTUAL_NMI_PENDING);
6425 /* clear pending NMI */
6426 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
6427 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
6428 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
6429 ++vcpu->stat.nmi_window_exits; 6333 ++vcpu->stat.nmi_window_exits;
6430 kvm_make_request(KVM_REQ_EVENT, vcpu); 6334 kvm_make_request(KVM_REQ_EVENT, vcpu);
6431 6335
@@ -6663,8 +6567,10 @@ static __init int hardware_setup(void)
6663 if (!cpu_has_vmx_ple()) 6567 if (!cpu_has_vmx_ple())
6664 ple_gap = 0; 6568 ple_gap = 0;
6665 6569
6666 if (!cpu_has_vmx_apicv()) 6570 if (!cpu_has_vmx_apicv()) {
6667 enable_apicv = 0; 6571 enable_apicv = 0;
6572 kvm_x86_ops->sync_pir_to_irr = NULL;
6573 }
6668 6574
6669 if (cpu_has_vmx_tsc_scaling()) { 6575 if (cpu_has_vmx_tsc_scaling()) {
6670 kvm_has_tsc_control = true; 6576 kvm_has_tsc_control = true;
@@ -7134,6 +7040,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
7134 return 0; 7040 return 0;
7135} 7041}
7136 7042
7043static int enter_vmx_operation(struct kvm_vcpu *vcpu)
7044{
7045 struct vcpu_vmx *vmx = to_vmx(vcpu);
7046 struct vmcs *shadow_vmcs;
7047
7048 if (cpu_has_vmx_msr_bitmap()) {
7049 vmx->nested.msr_bitmap =
7050 (unsigned long *)__get_free_page(GFP_KERNEL);
7051 if (!vmx->nested.msr_bitmap)
7052 goto out_msr_bitmap;
7053 }
7054
7055 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
7056 if (!vmx->nested.cached_vmcs12)
7057 goto out_cached_vmcs12;
7058
7059 if (enable_shadow_vmcs) {
7060 shadow_vmcs = alloc_vmcs();
7061 if (!shadow_vmcs)
7062 goto out_shadow_vmcs;
7063 /* mark vmcs as shadow */
7064 shadow_vmcs->revision_id |= (1u << 31);
7065 /* init shadow vmcs */
7066 vmcs_clear(shadow_vmcs);
7067 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
7068 }
7069
7070 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
7071 vmx->nested.vmcs02_num = 0;
7072
7073 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
7074 HRTIMER_MODE_REL_PINNED);
7075 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
7076
7077 vmx->nested.vmxon = true;
7078 return 0;
7079
7080out_shadow_vmcs:
7081 kfree(vmx->nested.cached_vmcs12);
7082
7083out_cached_vmcs12:
7084 free_page((unsigned long)vmx->nested.msr_bitmap);
7085
7086out_msr_bitmap:
7087 return -ENOMEM;
7088}
7089
7137/* 7090/*
7138 * Emulate the VMXON instruction. 7091 * Emulate the VMXON instruction.
7139 * Currently, we just remember that VMX is active, and do not save or even 7092 * Currently, we just remember that VMX is active, and do not save or even
@@ -7144,9 +7097,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
7144 */ 7097 */
7145static int handle_vmon(struct kvm_vcpu *vcpu) 7098static int handle_vmon(struct kvm_vcpu *vcpu)
7146{ 7099{
7100 int ret;
7147 struct kvm_segment cs; 7101 struct kvm_segment cs;
7148 struct vcpu_vmx *vmx = to_vmx(vcpu); 7102 struct vcpu_vmx *vmx = to_vmx(vcpu);
7149 struct vmcs *shadow_vmcs;
7150 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 7103 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
7151 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 7104 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
7152 7105
@@ -7186,49 +7139,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
7186 7139
7187 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 7140 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
7188 return 1; 7141 return 1;
7189 7142
7190 if (cpu_has_vmx_msr_bitmap()) { 7143 ret = enter_vmx_operation(vcpu);
7191 vmx->nested.msr_bitmap = 7144 if (ret)
7192 (unsigned long *)__get_free_page(GFP_KERNEL); 7145 return ret;
7193 if (!vmx->nested.msr_bitmap)
7194 goto out_msr_bitmap;
7195 }
7196
7197 vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
7198 if (!vmx->nested.cached_vmcs12)
7199 goto out_cached_vmcs12;
7200
7201 if (enable_shadow_vmcs) {
7202 shadow_vmcs = alloc_vmcs();
7203 if (!shadow_vmcs)
7204 goto out_shadow_vmcs;
7205 /* mark vmcs as shadow */
7206 shadow_vmcs->revision_id |= (1u << 31);
7207 /* init shadow vmcs */
7208 vmcs_clear(shadow_vmcs);
7209 vmx->vmcs01.shadow_vmcs = shadow_vmcs;
7210 }
7211
7212 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
7213 vmx->nested.vmcs02_num = 0;
7214
7215 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
7216 HRTIMER_MODE_REL_PINNED);
7217 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
7218
7219 vmx->nested.vmxon = true;
7220 7146
7221 nested_vmx_succeed(vcpu); 7147 nested_vmx_succeed(vcpu);
7222 return kvm_skip_emulated_instruction(vcpu); 7148 return kvm_skip_emulated_instruction(vcpu);
7223
7224out_shadow_vmcs:
7225 kfree(vmx->nested.cached_vmcs12);
7226
7227out_cached_vmcs12:
7228 free_page((unsigned long)vmx->nested.msr_bitmap);
7229
7230out_msr_bitmap:
7231 return -ENOMEM;
7232} 7149}
7233 7150
7234/* 7151/*
@@ -7677,6 +7594,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
7677 return kvm_skip_emulated_instruction(vcpu); 7594 return kvm_skip_emulated_instruction(vcpu);
7678} 7595}
7679 7596
7597static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
7598{
7599 vmx->nested.current_vmptr = vmptr;
7600 if (enable_shadow_vmcs) {
7601 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7602 SECONDARY_EXEC_SHADOW_VMCS);
7603 vmcs_write64(VMCS_LINK_POINTER,
7604 __pa(vmx->vmcs01.shadow_vmcs));
7605 vmx->nested.sync_shadow_vmcs = true;
7606 }
7607}
7608
7680/* Emulate the VMPTRLD instruction */ 7609/* Emulate the VMPTRLD instruction */
7681static int handle_vmptrld(struct kvm_vcpu *vcpu) 7610static int handle_vmptrld(struct kvm_vcpu *vcpu)
7682{ 7611{
@@ -7707,7 +7636,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7707 } 7636 }
7708 7637
7709 nested_release_vmcs12(vmx); 7638 nested_release_vmcs12(vmx);
7710 vmx->nested.current_vmptr = vmptr;
7711 vmx->nested.current_vmcs12 = new_vmcs12; 7639 vmx->nested.current_vmcs12 = new_vmcs12;
7712 vmx->nested.current_vmcs12_page = page; 7640 vmx->nested.current_vmcs12_page = page;
7713 /* 7641 /*
@@ -7716,14 +7644,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
7716 */ 7644 */
7717 memcpy(vmx->nested.cached_vmcs12, 7645 memcpy(vmx->nested.cached_vmcs12,
7718 vmx->nested.current_vmcs12, VMCS12_SIZE); 7646 vmx->nested.current_vmcs12, VMCS12_SIZE);
7719 7647 set_current_vmptr(vmx, vmptr);
7720 if (enable_shadow_vmcs) {
7721 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
7722 SECONDARY_EXEC_SHADOW_VMCS);
7723 vmcs_write64(VMCS_LINK_POINTER,
7724 __pa(vmx->vmcs01.shadow_vmcs));
7725 vmx->nested.sync_shadow_vmcs = true;
7726 }
7727 } 7648 }
7728 7649
7729 nested_vmx_succeed(vcpu); 7650 nested_vmx_succeed(vcpu);
@@ -8517,6 +8438,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
8517 u32 vectoring_info = vmx->idt_vectoring_info; 8438 u32 vectoring_info = vmx->idt_vectoring_info;
8518 8439
8519 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); 8440 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
8441 vcpu->arch.gpa_available = false;
8520 8442
8521 /* 8443 /*
8522 * Flush logged GPAs PML buffer, this will make dirty_bitmap more 8444 * Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -8735,6 +8657,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
8735 } 8657 }
8736} 8658}
8737 8659
8660static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
8661{
8662 struct vcpu_vmx *vmx = to_vmx(vcpu);
8663 int max_irr;
8664
8665 WARN_ON(!vcpu->arch.apicv_active);
8666 if (pi_test_on(&vmx->pi_desc)) {
8667 pi_clear_on(&vmx->pi_desc);
8668 /*
8669 * IOMMU can write to PIR.ON, so the barrier matters even on UP.
8670 * But on x86 this is just a compiler barrier anyway.
8671 */
8672 smp_mb__after_atomic();
8673 max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
8674 } else {
8675 max_irr = kvm_lapic_find_highest_irr(vcpu);
8676 }
8677 vmx_hwapic_irr_update(vcpu, max_irr);
8678 return max_irr;
8679}
8680
8738static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 8681static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
8739{ 8682{
8740 if (!kvm_vcpu_apicv_active(vcpu)) 8683 if (!kvm_vcpu_apicv_active(vcpu))
@@ -8746,6 +8689,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
8746 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 8689 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
8747} 8690}
8748 8691
8692static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
8693{
8694 struct vcpu_vmx *vmx = to_vmx(vcpu);
8695
8696 pi_clear_on(&vmx->pi_desc);
8697 memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
8698}
8699
8749static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 8700static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
8750{ 8701{
8751 u32 exit_intr_info; 8702 u32 exit_intr_info;
@@ -9591,17 +9542,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
9591 kvm_inject_page_fault(vcpu, fault); 9542 kvm_inject_page_fault(vcpu, fault);
9592} 9543}
9593 9544
9594static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 9545static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
9546 struct vmcs12 *vmcs12);
9547
9548static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9595 struct vmcs12 *vmcs12) 9549 struct vmcs12 *vmcs12)
9596{ 9550{
9597 struct vcpu_vmx *vmx = to_vmx(vcpu); 9551 struct vcpu_vmx *vmx = to_vmx(vcpu);
9598 int maxphyaddr = cpuid_maxphyaddr(vcpu); 9552 u64 hpa;
9599 9553
9600 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 9554 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
9601 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
9602 vmcs12->apic_access_addr >> maxphyaddr)
9603 return false;
9604
9605 /* 9555 /*
9606 * Translate L1 physical address to host physical 9556 * Translate L1 physical address to host physical
9607 * address for vmcs02. Keep the page pinned, so this 9557 * address for vmcs02. Keep the page pinned, so this
@@ -9612,59 +9562,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
9612 nested_release_page(vmx->nested.apic_access_page); 9562 nested_release_page(vmx->nested.apic_access_page);
9613 vmx->nested.apic_access_page = 9563 vmx->nested.apic_access_page =
9614 nested_get_page(vcpu, vmcs12->apic_access_addr); 9564 nested_get_page(vcpu, vmcs12->apic_access_addr);
9565 /*
9566 * If translation failed, no matter: This feature asks
9567 * to exit when accessing the given address, and if it
9568 * can never be accessed, this feature won't do
9569 * anything anyway.
9570 */
9571 if (vmx->nested.apic_access_page) {
9572 hpa = page_to_phys(vmx->nested.apic_access_page);
9573 vmcs_write64(APIC_ACCESS_ADDR, hpa);
9574 } else {
9575 vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
9576 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9577 }
9578 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9579 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
9580 vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
9581 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
9582 kvm_vcpu_reload_apic_access_page(vcpu);
9615 } 9583 }
9616 9584
9617 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 9585 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
9618 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
9619 vmcs12->virtual_apic_page_addr >> maxphyaddr)
9620 return false;
9621
9622 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 9586 if (vmx->nested.virtual_apic_page) /* shouldn't happen */
9623 nested_release_page(vmx->nested.virtual_apic_page); 9587 nested_release_page(vmx->nested.virtual_apic_page);
9624 vmx->nested.virtual_apic_page = 9588 vmx->nested.virtual_apic_page =
9625 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 9589 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
9626 9590
9627 /* 9591 /*
9628 * Failing the vm entry is _not_ what the processor does 9592 * If translation failed, VM entry will fail because
9629 * but it's basically the only possibility we have. 9593 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
9630 * We could still enter the guest if CR8 load exits are 9594 * Failing the vm entry is _not_ what the processor
9631 * enabled, CR8 store exits are enabled, and virtualize APIC 9595 * does but it's basically the only possibility we
9632 * access is disabled; in this case the processor would never 9596 * have. We could still enter the guest if CR8 load
9633 * use the TPR shadow and we could simply clear the bit from 9597 * exits are enabled, CR8 store exits are enabled, and
9634 * the execution control. But such a configuration is useless, 9598 * virtualize APIC access is disabled; in this case
9635 * so let's keep the code simple. 9599 * the processor would never use the TPR shadow and we
9600 * could simply clear the bit from the execution
9601 * control. But such a configuration is useless, so
9602 * let's keep the code simple.
9636 */ 9603 */
9637 if (!vmx->nested.virtual_apic_page) 9604 if (vmx->nested.virtual_apic_page) {
9638 return false; 9605 hpa = page_to_phys(vmx->nested.virtual_apic_page);
9606 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
9607 }
9639 } 9608 }
9640 9609
9641 if (nested_cpu_has_posted_intr(vmcs12)) { 9610 if (nested_cpu_has_posted_intr(vmcs12)) {
9642 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
9643 vmcs12->posted_intr_desc_addr >> maxphyaddr)
9644 return false;
9645
9646 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 9611 if (vmx->nested.pi_desc_page) { /* shouldn't happen */
9647 kunmap(vmx->nested.pi_desc_page); 9612 kunmap(vmx->nested.pi_desc_page);
9648 nested_release_page(vmx->nested.pi_desc_page); 9613 nested_release_page(vmx->nested.pi_desc_page);
9649 } 9614 }
9650 vmx->nested.pi_desc_page = 9615 vmx->nested.pi_desc_page =
9651 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 9616 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
9652 if (!vmx->nested.pi_desc_page)
9653 return false;
9654
9655 vmx->nested.pi_desc = 9617 vmx->nested.pi_desc =
9656 (struct pi_desc *)kmap(vmx->nested.pi_desc_page); 9618 (struct pi_desc *)kmap(vmx->nested.pi_desc_page);
9657 if (!vmx->nested.pi_desc) { 9619 if (!vmx->nested.pi_desc) {
9658 nested_release_page_clean(vmx->nested.pi_desc_page); 9620 nested_release_page_clean(vmx->nested.pi_desc_page);
9659 return false; 9621 return;
9660 } 9622 }
9661 vmx->nested.pi_desc = 9623 vmx->nested.pi_desc =
9662 (struct pi_desc *)((void *)vmx->nested.pi_desc + 9624 (struct pi_desc *)((void *)vmx->nested.pi_desc +
9663 (unsigned long)(vmcs12->posted_intr_desc_addr & 9625 (unsigned long)(vmcs12->posted_intr_desc_addr &
9664 (PAGE_SIZE - 1))); 9626 (PAGE_SIZE - 1)));
9627 vmcs_write64(POSTED_INTR_DESC_ADDR,
9628 page_to_phys(vmx->nested.pi_desc_page) +
9629 (unsigned long)(vmcs12->posted_intr_desc_addr &
9630 (PAGE_SIZE - 1)));
9665 } 9631 }
9666 9632 if (cpu_has_vmx_msr_bitmap() &&
9667 return true; 9633 nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
9634 nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
9635 ;
9636 else
9637 vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
9638 CPU_BASED_USE_MSR_BITMAPS);
9668} 9639}
9669 9640
9670static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 9641static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
@@ -9980,7 +9951,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
9980 * is assigned to entry_failure_code on failure. 9951 * is assigned to entry_failure_code on failure.
9981 */ 9952 */
9982static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, 9953static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
9983 unsigned long *entry_failure_code) 9954 u32 *entry_failure_code)
9984{ 9955{
9985 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { 9956 if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
9986 if (!nested_cr3_valid(vcpu, cr3)) { 9957 if (!nested_cr3_valid(vcpu, cr3)) {
@@ -10020,7 +9991,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
10020 * is assigned to entry_failure_code on failure. 9991 * is assigned to entry_failure_code on failure.
10021 */ 9992 */
10022static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 9993static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10023 unsigned long *entry_failure_code) 9994 bool from_vmentry, u32 *entry_failure_code)
10024{ 9995{
10025 struct vcpu_vmx *vmx = to_vmx(vcpu); 9996 struct vcpu_vmx *vmx = to_vmx(vcpu);
10026 u32 exec_control; 9997 u32 exec_control;
@@ -10063,21 +10034,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10063 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 10034 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
10064 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 10035 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
10065 10036
10066 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 10037 if (from_vmentry &&
10038 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
10067 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 10039 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
10068 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 10040 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
10069 } else { 10041 } else {
10070 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 10042 kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
10071 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 10043 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
10072 } 10044 }
10073 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 10045 if (from_vmentry) {
10074 vmcs12->vm_entry_intr_info_field); 10046 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
10075 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 10047 vmcs12->vm_entry_intr_info_field);
10076 vmcs12->vm_entry_exception_error_code); 10048 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
10077 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 10049 vmcs12->vm_entry_exception_error_code);
10078 vmcs12->vm_entry_instruction_len); 10050 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
10079 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 10051 vmcs12->vm_entry_instruction_len);
10080 vmcs12->guest_interruptibility_info); 10052 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
10053 vmcs12->guest_interruptibility_info);
10054 } else {
10055 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
10056 }
10081 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 10057 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
10082 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 10058 vmx_set_rflags(vcpu, vmcs12->guest_rflags);
10083 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 10059 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10106,12 +10082,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10106 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 10082 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
10107 vmx->nested.pi_pending = false; 10083 vmx->nested.pi_pending = false;
10108 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); 10084 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
10109 vmcs_write64(POSTED_INTR_DESC_ADDR, 10085 } else {
10110 page_to_phys(vmx->nested.pi_desc_page) +
10111 (unsigned long)(vmcs12->posted_intr_desc_addr &
10112 (PAGE_SIZE - 1)));
10113 } else
10114 exec_control &= ~PIN_BASED_POSTED_INTR; 10086 exec_control &= ~PIN_BASED_POSTED_INTR;
10087 }
10115 10088
10116 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 10089 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
10117 10090
@@ -10156,26 +10129,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10156 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 10129 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
10157 exec_control |= vmcs12->secondary_vm_exec_control; 10130 exec_control |= vmcs12->secondary_vm_exec_control;
10158 10131
10159 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
10160 /*
10161 * If translation failed, no matter: This feature asks
10162 * to exit when accessing the given address, and if it
10163 * can never be accessed, this feature won't do
10164 * anything anyway.
10165 */
10166 if (!vmx->nested.apic_access_page)
10167 exec_control &=
10168 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10169 else
10170 vmcs_write64(APIC_ACCESS_ADDR,
10171 page_to_phys(vmx->nested.apic_access_page));
10172 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
10173 cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
10174 exec_control |=
10175 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
10176 kvm_vcpu_reload_apic_access_page(vcpu);
10177 }
10178
10179 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 10132 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
10180 vmcs_write64(EOI_EXIT_BITMAP0, 10133 vmcs_write64(EOI_EXIT_BITMAP0,
10181 vmcs12->eoi_exit_bitmap0); 10134 vmcs12->eoi_exit_bitmap0);
@@ -10190,6 +10143,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10190 } 10143 }
10191 10144
10192 nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; 10145 nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
10146
10147 /*
10148 * Write an illegal value to APIC_ACCESS_ADDR. Later,
10149 * nested_get_vmcs12_pages will either fix it up or
10150 * remove the VM execution control.
10151 */
10152 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
10153 vmcs_write64(APIC_ACCESS_ADDR, -1ull);
10154
10193 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 10155 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
10194 } 10156 }
10195 10157
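
The two hunks above make prepare_vmcs02() write the -1ull sentinel into APIC_ACCESS_ADDR and defer the real mapping to nested_get_vmcs12_pages(), which enter_vmx_non_root_mode() now calls after prepare_vmcs02() (see the hunk further down). The body of nested_get_vmcs12_pages() is not part of this excerpt, so the sketch below is only a reconstruction of what the deferred fix-up plausibly looks like, based on the logic removed from prepare_vmcs02() above; the helper name is invented for illustration.

static void nested_fixup_apic_access_addr(struct kvm_vcpu *vcpu,
                                          struct vmcs12 *vmcs12)
{
        struct vcpu_vmx *vmx = to_vmx(vcpu);

        if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
                return;

        if (vmx->nested.apic_access_page)
                /* Replace the -1ull sentinel with the real physical address. */
                vmcs_write64(APIC_ACCESS_ADDR,
                             page_to_phys(vmx->nested.apic_access_page));
        else
                /*
                 * Translation failed; the address can never be accessed, so
                 * drop the execution control instead of failing the entry.
                 */
                vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
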
@@ -10226,19 +10188,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10226 exec_control &= ~CPU_BASED_TPR_SHADOW; 10188 exec_control &= ~CPU_BASED_TPR_SHADOW;
10227 exec_control |= vmcs12->cpu_based_vm_exec_control; 10189 exec_control |= vmcs12->cpu_based_vm_exec_control;
10228 10190
10191 /*
10192 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
10193 * nested_get_vmcs12_pages can't fix it up, the illegal value
10194 * will result in a VM entry failure.
10195 */
10229 if (exec_control & CPU_BASED_TPR_SHADOW) { 10196 if (exec_control & CPU_BASED_TPR_SHADOW) {
10230 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 10197 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
10231 page_to_phys(vmx->nested.virtual_apic_page));
10232 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 10198 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
10233 } 10199 }
10234 10200
10235 if (cpu_has_vmx_msr_bitmap() &&
10236 exec_control & CPU_BASED_USE_MSR_BITMAPS &&
10237 nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
10238 ; /* MSR_BITMAP will be set by following vmx_set_efer. */
10239 else
10240 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
10241
10242 /* 10201 /*
10243 * Merging of IO bitmap not currently supported. 10202 * Merging of IO bitmap not currently supported.
10244 * Rather, exit every time. 10203 * Rather, exit every time.
@@ -10270,16 +10229,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10270 ~VM_ENTRY_IA32E_MODE) | 10229 ~VM_ENTRY_IA32E_MODE) |
10271 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 10230 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
10272 10231
10273 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 10232 if (from_vmentry &&
10233 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
10274 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 10234 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
10275 vcpu->arch.pat = vmcs12->guest_ia32_pat; 10235 vcpu->arch.pat = vmcs12->guest_ia32_pat;
10276 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 10236 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
10277 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 10237 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
10278 10238 }
10279 10239
10280 set_cr4_guest_host_mask(vmx); 10240 set_cr4_guest_host_mask(vmx);
10281 10241
10282 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 10242 if (from_vmentry &&
10243 vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
10283 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 10244 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
10284 10245
10285 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 10246 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10318,8 +10279,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10318 } 10279 }
10319 10280
10320 /* 10281 /*
10321 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 10282 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
10322 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 10283 * bits which we consider mandatory enabled.
10323 * The CR0_READ_SHADOW is what L2 should have expected to read given 10284 * The CR0_READ_SHADOW is what L2 should have expected to read given
10324 * the specifications by L1; It's not enough to take 10285 * the specifications by L1; It's not enough to take
10325 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we 10286 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
@@ -10331,7 +10292,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10331 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 10292 vmx_set_cr4(vcpu, vmcs12->guest_cr4);
10332 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 10293 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
10333 10294
10334 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 10295 if (from_vmentry &&
10296 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
10335 vcpu->arch.efer = vmcs12->guest_ia32_efer; 10297 vcpu->arch.efer = vmcs12->guest_ia32_efer;
10336 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 10298 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
10337 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 10299 vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10365,73 +10327,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10365 return 0; 10327 return 0;
10366} 10328}
10367 10329
10368/* 10330static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10369 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
10370 * for running an L2 nested guest.
10371 */
10372static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10373{ 10331{
10374 struct vmcs12 *vmcs12;
10375 struct vcpu_vmx *vmx = to_vmx(vcpu); 10332 struct vcpu_vmx *vmx = to_vmx(vcpu);
10376 int cpu;
10377 struct loaded_vmcs *vmcs02;
10378 bool ia32e;
10379 u32 msr_entry_idx;
10380 unsigned long exit_qualification;
10381
10382 if (!nested_vmx_check_permission(vcpu))
10383 return 1;
10384
10385 if (!nested_vmx_check_vmcs12(vcpu))
10386 goto out;
10387
10388 vmcs12 = get_vmcs12(vcpu);
10389
10390 if (enable_shadow_vmcs)
10391 copy_shadow_to_vmcs12(vmx);
10392
10393 /*
10394 * The nested entry process starts with enforcing various prerequisites
10395 * on vmcs12 as required by the Intel SDM, and act appropriately when
10396 * they fail: As the SDM explains, some conditions should cause the
10397 * instruction to fail, while others will cause the instruction to seem
10398 * to succeed, but return an EXIT_REASON_INVALID_STATE.
10399 * To speed up the normal (success) code path, we should avoid checking
10400 * for misconfigurations which will anyway be caught by the processor
10401 * when using the merged vmcs02.
10402 */
10403 if (vmcs12->launch_state == launch) {
10404 nested_vmx_failValid(vcpu,
10405 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
10406 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
10407 goto out;
10408 }
10409 10333
10410 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 10334 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
10411 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 10335 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
10412 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10336 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10413 goto out;
10414 }
10415 10337
10416 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 10338 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
10417 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10339 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10418 goto out;
10419 }
10420
10421 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
10422 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
10423 goto out;
10424 }
10425 10340
10426 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { 10341 if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
10427 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10342 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10428 goto out;
10429 }
10430 10343
10431 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { 10344 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
10432 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 10345 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10433 goto out;
10434 }
10435 10346
10436 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 10347 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
10437 vmx->nested.nested_vmx_procbased_ctls_low, 10348 vmx->nested.nested_vmx_procbased_ctls_low,
@@ -10448,28 +10359,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10448 !vmx_control_verify(vmcs12->vm_entry_controls, 10359 !vmx_control_verify(vmcs12->vm_entry_controls,
10449 vmx->nested.nested_vmx_entry_ctls_low, 10360 vmx->nested.nested_vmx_entry_ctls_low,
10450 vmx->nested.nested_vmx_entry_ctls_high)) 10361 vmx->nested.nested_vmx_entry_ctls_high))
10451 { 10362 return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
10452 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
10453 goto out;
10454 }
10455 10363
10456 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || 10364 if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
10457 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || 10365 !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
10458 !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { 10366 !nested_cr3_valid(vcpu, vmcs12->host_cr3))
10459 nested_vmx_failValid(vcpu, 10367 return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
10460 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 10368
10461 goto out; 10369 return 0;
10462 } 10370}
10371
10372static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10373 u32 *exit_qual)
10374{
10375 bool ia32e;
10376
10377 *exit_qual = ENTRY_FAIL_DEFAULT;
10463 10378
10464 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || 10379 if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
10465 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { 10380 !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
10466 nested_vmx_entry_failure(vcpu, vmcs12,
10467 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10468 return 1; 10381 return 1;
10469 } 10382
10470 if (vmcs12->vmcs_link_pointer != -1ull) { 10383 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
10471 nested_vmx_entry_failure(vcpu, vmcs12, 10384 vmcs12->vmcs_link_pointer != -1ull) {
10472 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); 10385 *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
10473 return 1; 10386 return 1;
10474 } 10387 }
10475 10388
@@ -10482,16 +10395,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10482 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 10395 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
10483 * CR0.PG) is 1. 10396 * CR0.PG) is 1.
10484 */ 10397 */
10485 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) { 10398 if (to_vmx(vcpu)->nested.nested_run_pending &&
10399 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
10486 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0; 10400 ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
10487 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) || 10401 if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
10488 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) || 10402 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
10489 ((vmcs12->guest_cr0 & X86_CR0_PG) && 10403 ((vmcs12->guest_cr0 & X86_CR0_PG) &&
10490 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { 10404 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
10491 nested_vmx_entry_failure(vcpu, vmcs12,
10492 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10493 return 1; 10405 return 1;
10494 }
10495 } 10406 }
10496 10407
10497 /* 10408 /*
@@ -10505,28 +10416,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10505 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0; 10416 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
10506 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) || 10417 if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
10507 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) || 10418 ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
10508 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { 10419 ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
10509 nested_vmx_entry_failure(vcpu, vmcs12,
10510 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
10511 return 1; 10420 return 1;
10512 }
10513 } 10421 }
10514 10422
10515 /* 10423 return 0;
10516 * We're finally done with prerequisite checking, and can start with 10424}
10517 * the nested entry. 10425
10518 */ 10426static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
10427{
10428 struct vcpu_vmx *vmx = to_vmx(vcpu);
10429 struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10430 struct loaded_vmcs *vmcs02;
10431 int cpu;
10432 u32 msr_entry_idx;
10433 u32 exit_qual;
10519 10434
10520 vmcs02 = nested_get_current_vmcs02(vmx); 10435 vmcs02 = nested_get_current_vmcs02(vmx);
10521 if (!vmcs02) 10436 if (!vmcs02)
10522 return -ENOMEM; 10437 return -ENOMEM;
10523 10438
10524 /*
10525 * After this point, the trap flag no longer triggers a singlestep trap
10526 * on the vm entry instructions. Don't call
10527 * kvm_skip_emulated_instruction.
10528 */
10529 skip_emulated_instruction(vcpu);
10530 enter_guest_mode(vcpu); 10439 enter_guest_mode(vcpu);
10531 10440
10532 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 10441 if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10541,14 +10450,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10541 10450
10542 vmx_segment_cache_clear(vmx); 10451 vmx_segment_cache_clear(vmx);
10543 10452
10544 if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { 10453 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
10545 leave_guest_mode(vcpu); 10454 leave_guest_mode(vcpu);
10546 vmx_load_vmcs01(vcpu); 10455 vmx_load_vmcs01(vcpu);
10547 nested_vmx_entry_failure(vcpu, vmcs12, 10456 nested_vmx_entry_failure(vcpu, vmcs12,
10548 EXIT_REASON_INVALID_STATE, exit_qualification); 10457 EXIT_REASON_INVALID_STATE, exit_qual);
10549 return 1; 10458 return 1;
10550 } 10459 }
10551 10460
10461 nested_get_vmcs12_pages(vcpu, vmcs12);
10462
10552 msr_entry_idx = nested_vmx_load_msr(vcpu, 10463 msr_entry_idx = nested_vmx_load_msr(vcpu,
10553 vmcs12->vm_entry_msr_load_addr, 10464 vmcs12->vm_entry_msr_load_addr,
10554 vmcs12->vm_entry_msr_load_count); 10465 vmcs12->vm_entry_msr_load_count);
@@ -10562,17 +10473,90 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10562 10473
10563 vmcs12->launch_state = 1; 10474 vmcs12->launch_state = 1;
10564 10475
10565 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
10566 return kvm_vcpu_halt(vcpu);
10567
10568 vmx->nested.nested_run_pending = 1;
10569
10570 /* 10476 /*
10571 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 10477 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
10572 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 10478 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
10573 * returned as far as L1 is concerned. It will only return (and set 10479 * returned as far as L1 is concerned. It will only return (and set
10574 * the success flag) when L2 exits (see nested_vmx_vmexit()). 10480 * the success flag) when L2 exits (see nested_vmx_vmexit()).
10575 */ 10481 */
10482 return 0;
10483}
10484
10485/*
10486 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
10487 * for running an L2 nested guest.
10488 */
10489static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
10490{
10491 struct vmcs12 *vmcs12;
10492 struct vcpu_vmx *vmx = to_vmx(vcpu);
10493 u32 exit_qual;
10494 int ret;
10495
10496 if (!nested_vmx_check_permission(vcpu))
10497 return 1;
10498
10499 if (!nested_vmx_check_vmcs12(vcpu))
10500 goto out;
10501
10502 vmcs12 = get_vmcs12(vcpu);
10503
10504 if (enable_shadow_vmcs)
10505 copy_shadow_to_vmcs12(vmx);
10506
10507 /*
10508 * The nested entry process starts with enforcing various prerequisites
10509 * on vmcs12 as required by the Intel SDM, and act appropriately when
10510 * they fail: As the SDM explains, some conditions should cause the
10511 * instruction to fail, while others will cause the instruction to seem
10512 * to succeed, but return an EXIT_REASON_INVALID_STATE.
10513 * To speed up the normal (success) code path, we should avoid checking
10514 * for misconfigurations which will anyway be caught by the processor
10515 * when using the merged vmcs02.
10516 */
10517 if (vmcs12->launch_state == launch) {
10518 nested_vmx_failValid(vcpu,
10519 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
10520 : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
10521 goto out;
10522 }
10523
10524 ret = check_vmentry_prereqs(vcpu, vmcs12);
10525 if (ret) {
10526 nested_vmx_failValid(vcpu, ret);
10527 goto out;
10528 }
10529
10530 /*
10531 * After this point, the trap flag no longer triggers a singlestep trap
10532 * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
10533 * This is not 100% correct; for performance reasons, we delegate most
10534 * of the checks on host state to the processor. If those fail,
10535 * the singlestep trap is missed.
10536 */
10537 skip_emulated_instruction(vcpu);
10538
10539 ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
10540 if (ret) {
10541 nested_vmx_entry_failure(vcpu, vmcs12,
10542 EXIT_REASON_INVALID_STATE, exit_qual);
10543 return 1;
10544 }
10545
10546 /*
10547 * We're finally done with prerequisite checking, and can start with
10548 * the nested entry.
10549 */
10550
10551 ret = enter_vmx_non_root_mode(vcpu, true);
10552 if (ret)
10553 return ret;
10554
10555 if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
10556 return kvm_vcpu_halt(vcpu);
10557
10558 vmx->nested.nested_run_pending = 1;
10559
10576 return 1; 10560 return 1;
10577 10561
10578out: 10562out:
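
Reading only the new-side lines of this and the preceding hunks, nested_vmx_run() is now split into three stages, with the from_vmentry flag (threaded through enter_vmx_non_root_mode() into prepare_vmcs02()) gating the state loads that should only happen on a real VMLAUNCH/VMRESUME: debug controls, event injection, PAT, BNDCFGS and EFER. A condensed sketch of the resulting flow, with local declarations and the permission/vmcs12 checks left out:

        /* 1) vmcs12 control-field checks: fail the VM instruction itself. */
        ret = check_vmentry_prereqs(vcpu, vmcs12);
        if (ret) {
                nested_vmx_failValid(vcpu, ret);
                goto out;
        }

        /* Past this point the trap flag no longer yields a single-step trap. */
        skip_emulated_instruction(vcpu);

        /* 2) Guest-state checks: report a VM-entry failure to L1 instead. */
        ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
        if (ret) {
                nested_vmx_entry_failure(vcpu, vmcs12,
                                         EXIT_REASON_INVALID_STATE, exit_qual);
                return 1;
        }

        /* 3) Switch to vmcs02 and load the L2 state. */
        ret = enter_vmx_non_root_mode(vcpu, true);
        if (ret)
                return ret;

        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
                return kvm_vcpu_halt(vcpu);

        vmx->nested.nested_run_pending = 1;
        return 1;
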
@@ -10713,21 +10697,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
10713} 10697}
10714 10698
10715/* 10699/*
10716 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 10700 * Update the guest state fields of vmcs12 to reflect changes that
10717 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 10701 * occurred while L2 was running. (The "IA-32e mode guest" bit of the
10718 * and this function updates it to reflect the changes to the guest state while 10702 * VM-entry controls is also updated, since this is really a guest
10719 * L2 was running (and perhaps made some exits which were handled directly by L0 10703 * state bit.)
10720 * without going back to L1), and to reflect the exit reason.
10721 * Note that we do not have to copy here all VMCS fields, just those that
10722 * could have changed by the L2 guest or the exit - i.e., the guest-state and
10723 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
10724 * which already writes to vmcs12 directly.
10725 */ 10704 */
10726static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 10705static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
10727 u32 exit_reason, u32 exit_intr_info,
10728 unsigned long exit_qualification)
10729{ 10706{
10730 /* update guest state fields: */
10731 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 10707 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
10732 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 10708 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
10733 10709
@@ -10833,6 +10809,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10833 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 10809 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
10834 if (nested_cpu_has_xsaves(vmcs12)) 10810 if (nested_cpu_has_xsaves(vmcs12))
10835 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP); 10811 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
10812}
10813
10814/*
10815 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
10816 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
10817 * and this function updates it to reflect the changes to the guest state while
10818 * L2 was running (and perhaps made some exits which were handled directly by L0
10819 * without going back to L1), and to reflect the exit reason.
10820 * Note that we do not have to copy here all VMCS fields, just those that
10821 * could have changed by the L2 guest or the exit - i.e., the guest-state and
10822 * exit-information fields only. Other fields are modified by L1 with VMWRITE,
10823 * which already writes to vmcs12 directly.
10824 */
10825static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
10826 u32 exit_reason, u32 exit_intr_info,
10827 unsigned long exit_qualification)
10828{
10829 /* update guest state fields: */
10830 sync_vmcs12(vcpu, vmcs12);
10836 10831
10837 /* update exit information fields: */ 10832 /* update exit information fields: */
10838 10833
@@ -10883,7 +10878,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
10883 struct vmcs12 *vmcs12) 10878 struct vmcs12 *vmcs12)
10884{ 10879{
10885 struct kvm_segment seg; 10880 struct kvm_segment seg;
10886 unsigned long entry_failure_code; 10881 u32 entry_failure_code;
10887 10882
10888 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 10883 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
10889 vcpu->arch.efer = vmcs12->host_ia32_efer; 10884 vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10898,24 +10893,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
10898 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 10893 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
10899 /* 10894 /*
10900 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 10895 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
10901 * actually changed, because it depends on the current state of 10896 * actually changed, because vmx_set_cr0 refers to efer set above.
10902 * fpu_active (which may have changed). 10897 *
10903 * Note that vmx_set_cr0 refers to efer set above. 10898 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
10899 * (KVM doesn't change it);
10904 */ 10900 */
10901 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
10905 vmx_set_cr0(vcpu, vmcs12->host_cr0); 10902 vmx_set_cr0(vcpu, vmcs12->host_cr0);
10906 /*
10907 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
10908 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
10909 * but we also need to update cr0_guest_host_mask and exception_bitmap.
10910 */
10911 update_exception_bitmap(vcpu);
10912 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
10913 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
10914 10903
10915 /* 10904 /* Same as above - no reason to call set_cr4_guest_host_mask(). */
10916 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
10917 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
10918 */
10919 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 10905 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
10920 kvm_set_cr4(vcpu, vmcs12->host_cr4); 10906 kvm_set_cr4(vcpu, vmcs12->host_cr4);
10921 10907
@@ -11544,9 +11530,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
11544 11530
11545 .get_pkru = vmx_get_pkru, 11531 .get_pkru = vmx_get_pkru,
11546 11532
11547 .fpu_activate = vmx_fpu_activate,
11548 .fpu_deactivate = vmx_fpu_deactivate,
11549
11550 .tlb_flush = vmx_flush_tlb, 11533 .tlb_flush = vmx_flush_tlb,
11551 11534
11552 .run = vmx_vcpu_run, 11535 .run = vmx_vcpu_run,
@@ -11571,6 +11554,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
11571 .get_enable_apicv = vmx_get_enable_apicv, 11554 .get_enable_apicv = vmx_get_enable_apicv,
11572 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, 11555 .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
11573 .load_eoi_exitmap = vmx_load_eoi_exitmap, 11556 .load_eoi_exitmap = vmx_load_eoi_exitmap,
11557 .apicv_post_state_restore = vmx_apicv_post_state_restore,
11574 .hwapic_irr_update = vmx_hwapic_irr_update, 11558 .hwapic_irr_update = vmx_hwapic_irr_update,
11575 .hwapic_isr_update = vmx_hwapic_isr_update, 11559 .hwapic_isr_update = vmx_hwapic_isr_update,
11576 .sync_pir_to_irr = vmx_sync_pir_to_irr, 11560 .sync_pir_to_irr = vmx_sync_pir_to_irr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2f64e5d0ae53..c48404017e4f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1811,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1811 struct kvm_vcpu_arch *vcpu = &v->arch; 1811 struct kvm_vcpu_arch *vcpu = &v->arch;
1812 struct pvclock_vcpu_time_info guest_hv_clock; 1812 struct pvclock_vcpu_time_info guest_hv_clock;
1813 1813
1814 if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, 1814 if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
1815 &guest_hv_clock, sizeof(guest_hv_clock)))) 1815 &guest_hv_clock, sizeof(guest_hv_clock))))
1816 return; 1816 return;
1817 1817
@@ -1832,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1832 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); 1832 BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
1833 1833
1834 vcpu->hv_clock.version = guest_hv_clock.version + 1; 1834 vcpu->hv_clock.version = guest_hv_clock.version + 1;
1835 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1835 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1836 &vcpu->hv_clock, 1836 &vcpu->hv_clock,
1837 sizeof(vcpu->hv_clock.version)); 1837 sizeof(vcpu->hv_clock.version));
1838 1838
1839 smp_wmb(); 1839 smp_wmb();
1840 1840
@@ -1848,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
1848 1848
1849 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); 1849 trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
1850 1850
1851 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1851 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1852 &vcpu->hv_clock, 1852 &vcpu->hv_clock,
1853 sizeof(vcpu->hv_clock)); 1853 sizeof(vcpu->hv_clock));
1854 1854
1855 smp_wmb(); 1855 smp_wmb();
1856 1856
1857 vcpu->hv_clock.version++; 1857 vcpu->hv_clock.version++;
1858 kvm_write_guest_cached(v->kvm, &vcpu->pv_time, 1858 kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
1859 &vcpu->hv_clock, 1859 &vcpu->hv_clock,
1860 sizeof(vcpu->hv_clock.version)); 1860 sizeof(vcpu->hv_clock.version));
1861} 1861}
1862 1862
1863static int kvm_guest_time_update(struct kvm_vcpu *v) 1863static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2090,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2090 return 0; 2090 return 0;
2091 } 2091 }
2092 2092
2093 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, 2093 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
2094 sizeof(u32))) 2094 sizeof(u32)))
2095 return 1; 2095 return 1;
2096 2096
@@ -2109,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2109 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) 2109 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2110 return; 2110 return;
2111 2111
2112 if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2112 if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
2113 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) 2113 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2114 return; 2114 return;
2115 2115
@@ -2120,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2120 2120
2121 vcpu->arch.st.steal.version += 1; 2121 vcpu->arch.st.steal.version += 1;
2122 2122
2123 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2123 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2124 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2124 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2125 2125
2126 smp_wmb(); 2126 smp_wmb();
@@ -2129,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
2129 vcpu->arch.st.last_steal; 2129 vcpu->arch.st.last_steal;
2130 vcpu->arch.st.last_steal = current->sched_info.run_delay; 2130 vcpu->arch.st.last_steal = current->sched_info.run_delay;
2131 2131
2132 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2132 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2133 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2133 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2134 2134
2135 smp_wmb(); 2135 smp_wmb();
2136 2136
2137 vcpu->arch.st.steal.version += 1; 2137 vcpu->arch.st.steal.version += 1;
2138 2138
2139 kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, 2139 kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
2140 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 2140 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2141} 2141}
2142 2142
@@ -2241,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2241 if (!(data & 1)) 2241 if (!(data & 1))
2242 break; 2242 break;
2243 2243
2244 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, 2244 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
2245 &vcpu->arch.pv_time, data & ~1ULL, 2245 &vcpu->arch.pv_time, data & ~1ULL,
2246 sizeof(struct pvclock_vcpu_time_info))) 2246 sizeof(struct pvclock_vcpu_time_info)))
2247 vcpu->arch.pv_time_enabled = false; 2247 vcpu->arch.pv_time_enabled = false;
@@ -2262,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2262 if (data & KVM_STEAL_RESERVED_MASK) 2262 if (data & KVM_STEAL_RESERVED_MASK)
2263 return 1; 2263 return 1;
2264 2264
2265 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, 2265 if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
2266 data & KVM_STEAL_VALID_BITS, 2266 data & KVM_STEAL_VALID_BITS,
2267 sizeof(struct kvm_steal_time))) 2267 sizeof(struct kvm_steal_time)))
2268 return 1; 2268 return 1;
@@ -2672,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2672 case KVM_CAP_DISABLE_QUIRKS: 2672 case KVM_CAP_DISABLE_QUIRKS:
2673 case KVM_CAP_SET_BOOT_CPU_ID: 2673 case KVM_CAP_SET_BOOT_CPU_ID:
2674 case KVM_CAP_SPLIT_IRQCHIP: 2674 case KVM_CAP_SPLIT_IRQCHIP:
2675 case KVM_CAP_IMMEDIATE_EXIT:
2675#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT 2676#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2676 case KVM_CAP_ASSIGN_DEV_IRQ: 2677 case KVM_CAP_ASSIGN_DEV_IRQ:
2677 case KVM_CAP_PCI_2_3: 2678 case KVM_CAP_PCI_2_3:
@@ -2875,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
2875 2876
2876 vcpu->arch.st.steal.preempted = 1; 2877 vcpu->arch.st.steal.preempted = 1;
2877 2878
2878 kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, 2879 kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
2879 &vcpu->arch.st.steal.preempted, 2880 &vcpu->arch.st.steal.preempted,
2880 offsetof(struct kvm_steal_time, preempted), 2881 offsetof(struct kvm_steal_time, preempted),
2881 sizeof(vcpu->arch.st.steal.preempted)); 2882 sizeof(vcpu->arch.st.steal.preempted));
@@ -2909,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2909static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2910static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2910 struct kvm_lapic_state *s) 2911 struct kvm_lapic_state *s)
2911{ 2912{
2912 if (vcpu->arch.apicv_active) 2913 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
2913 kvm_x86_ops->sync_pir_to_irr(vcpu); 2914 kvm_x86_ops->sync_pir_to_irr(vcpu);
2914 2915
2915 return kvm_apic_get_state(vcpu, s); 2916 return kvm_apic_get_state(vcpu, s);
@@ -6659,7 +6660,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6659 if (irqchip_split(vcpu->kvm)) 6660 if (irqchip_split(vcpu->kvm))
6660 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); 6661 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
6661 else { 6662 else {
6662 if (vcpu->arch.apicv_active) 6663 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
6663 kvm_x86_ops->sync_pir_to_irr(vcpu); 6664 kvm_x86_ops->sync_pir_to_irr(vcpu);
6664 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); 6665 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
6665 } 6666 }
@@ -6750,10 +6751,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6750 r = 0; 6751 r = 0;
6751 goto out; 6752 goto out;
6752 } 6753 }
6753 if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
6754 vcpu->fpu_active = 0;
6755 kvm_x86_ops->fpu_deactivate(vcpu);
6756 }
6757 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { 6754 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
6758 /* Page is swapped out. Do synthetic halt */ 6755 /* Page is swapped out. Do synthetic halt */
6759 vcpu->arch.apf.halted = true; 6756 vcpu->arch.apf.halted = true;
@@ -6813,20 +6810,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6813 kvm_hv_process_stimers(vcpu); 6810 kvm_hv_process_stimers(vcpu);
6814 } 6811 }
6815 6812
6816 /*
6817 * KVM_REQ_EVENT is not set when posted interrupts are set by
6818 * VT-d hardware, so we have to update RVI unconditionally.
6819 */
6820 if (kvm_lapic_enabled(vcpu)) {
6821 /*
6822 * Update architecture specific hints for APIC
6823 * virtual interrupt delivery.
6824 */
6825 if (vcpu->arch.apicv_active)
6826 kvm_x86_ops->hwapic_irr_update(vcpu,
6827 kvm_lapic_find_highest_irr(vcpu));
6828 }
6829
6830 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 6813 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
6831 ++vcpu->stat.req_event; 6814 ++vcpu->stat.req_event;
6832 kvm_apic_accept_events(vcpu); 6815 kvm_apic_accept_events(vcpu);
@@ -6869,22 +6852,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6869 preempt_disable(); 6852 preempt_disable();
6870 6853
6871 kvm_x86_ops->prepare_guest_switch(vcpu); 6854 kvm_x86_ops->prepare_guest_switch(vcpu);
6872 if (vcpu->fpu_active) 6855 kvm_load_guest_fpu(vcpu);
6873 kvm_load_guest_fpu(vcpu); 6856
6857 /*
6858 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
6859 * IPIs are then delayed after guest entry, which ensures that they
6860 * result in virtual interrupt delivery.
6861 */
6862 local_irq_disable();
6874 vcpu->mode = IN_GUEST_MODE; 6863 vcpu->mode = IN_GUEST_MODE;
6875 6864
6876 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 6865 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6877 6866
6878 /* 6867 /*
6879 * We should set ->mode before check ->requests, 6868 * 1) We should set ->mode before checking ->requests. Please see
6880 * Please see the comment in kvm_make_all_cpus_request. 6869 * the comment in kvm_make_all_cpus_request.
6881 * This also orders the write to mode from any reads 6870 *
6882 * to the page tables done while the VCPU is running. 6871 * 2) For APICv, we should set ->mode before checking PIR.ON. This
6883 * Please see the comment in kvm_flush_remote_tlbs. 6872 * pairs with the memory barrier implicit in pi_test_and_set_on
6873 * (see vmx_deliver_posted_interrupt).
6874 *
6875 * 3) This also orders the write to mode from any reads to the page
6876 * tables done while the VCPU is running. Please see the comment
6877 * in kvm_flush_remote_tlbs.
6884 */ 6878 */
6885 smp_mb__after_srcu_read_unlock(); 6879 smp_mb__after_srcu_read_unlock();
6886 6880
6887 local_irq_disable(); 6881 /*
6882 * This handles the case where a posted interrupt was
6883 * notified with kvm_vcpu_kick.
6884 */
6885 if (kvm_lapic_enabled(vcpu)) {
6886 if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
6887 kvm_x86_ops->sync_pir_to_irr(vcpu);
6888 }
6888 6889
6889 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests 6890 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
6890 || need_resched() || signal_pending(current)) { 6891 || need_resched() || signal_pending(current)) {
@@ -7023,6 +7024,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
7023 7024
7024static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) 7025static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
7025{ 7026{
7027 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7028 kvm_x86_ops->check_nested_events(vcpu, false);
7029
7026 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && 7030 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
7027 !vcpu->arch.apf.halted); 7031 !vcpu->arch.apf.halted);
7028} 7032}
@@ -7194,7 +7198,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
7194 } else 7198 } else
7195 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); 7199 WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
7196 7200
7197 r = vcpu_run(vcpu); 7201 if (kvm_run->immediate_exit)
7202 r = -EINTR;
7203 else
7204 r = vcpu_run(vcpu);
7198 7205
7199out: 7206out:
7200 post_kvm_run_save(vcpu); 7207 post_kvm_run_save(vcpu);
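
On x86 the run loop now checks kvm_run->immediate_exit (declared in the uapi hunk further down) and returns -EINTR without entering the guest. A minimal user-space sketch of how a VMM might use this to kick a vcpu; the vcpu fd, the mmap of the kvm_run structure and the sigaction()/pthread_kill() plumbing are assumed to exist elsewhere and are not part of this patch:

#include <errno.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_run *run;     /* mmap'ed from the vcpu fd during setup */

static void kick_handler(int sig)
{
        (void)sig;
        /* Ask the next KVM_RUN on this vcpu to bail out with -EINTR. */
        run->immediate_exit = 1;
}

static int run_vcpu_once(int vcpu_fd)
{
        run->immediate_exit = 0;
        if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                return errno == EINTR ? 0 : -1; /* 0: kicked, caller re-evaluates */
        /* otherwise dispatch on run->exit_reason as usual */
        return 0;
}
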
@@ -8389,9 +8396,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
8389 8396
8390int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 8397int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
8391{ 8398{
8392 if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
8393 kvm_x86_ops->check_nested_events(vcpu, false);
8394
8395 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); 8399 return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
8396} 8400}
8397 8401
@@ -8528,9 +8532,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
8528 8532
8529static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) 8533static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
8530{ 8534{
8531 8535 return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
8532 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, 8536 sizeof(val));
8533 sizeof(val));
8534} 8537}
8535 8538
8536void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, 8539void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/drivers/ptp/ptp_kvm.c b/drivers/ptp/ptp_kvm.c
index 0a54e8326a90..09b4df74291e 100644
--- a/drivers/ptp/ptp_kvm.c
+++ b/drivers/ptp/ptp_kvm.c
@@ -176,12 +176,19 @@ static void __exit ptp_kvm_exit(void)
176 176
177static int __init ptp_kvm_init(void) 177static int __init ptp_kvm_init(void)
178{ 178{
179 long ret;
180
179 clock_pair_gpa = slow_virt_to_phys(&clock_pair); 181 clock_pair_gpa = slow_virt_to_phys(&clock_pair);
180 hv_clock = pvclock_pvti_cpu0_va(); 182 hv_clock = pvclock_pvti_cpu0_va();
181 183
182 if (!hv_clock) 184 if (!hv_clock)
183 return -ENODEV; 185 return -ENODEV;
184 186
187 ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa,
188 KVM_CLOCK_PAIRING_WALLCLOCK);
189 if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP)
190 return -ENODEV;
191
185 kvm_ptp_clock.caps = ptp_kvm_caps; 192 kvm_ptp_clock.caps = ptp_kvm_caps;
186 193
187 kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL); 194 kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index cda457bcedc1..8d69d5150748 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -221,7 +221,6 @@ struct kvm_vcpu {
221 struct mutex mutex; 221 struct mutex mutex;
222 struct kvm_run *run; 222 struct kvm_run *run;
223 223
224 int fpu_active;
225 int guest_fpu_loaded, guest_xcr0_loaded; 224 int guest_fpu_loaded, guest_xcr0_loaded;
226 struct swait_queue_head wq; 225 struct swait_queue_head wq;
227 struct pid *pid; 226 struct pid *pid;
@@ -641,18 +640,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
641int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 640int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
642 unsigned long len); 641 unsigned long len);
643int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); 642int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
644int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 643int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
645 void *data, unsigned long len); 644 void *data, unsigned long len);
646int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 645int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
647 int offset, int len); 646 int offset, int len);
648int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 647int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
649 unsigned long len); 648 unsigned long len);
650int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 649int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
651 void *data, unsigned long len); 650 void *data, unsigned long len);
652int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 651int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
653 void *data, int offset, unsigned long len); 652 void *data, int offset, unsigned long len);
654int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 653int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc,
655 gpa_t gpa, unsigned long len); 654 gpa_t gpa, unsigned long len);
656int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); 655int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len);
657int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); 656int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
658struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); 657struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 7964b970b9ad..f51d5082a377 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -218,7 +218,8 @@ struct kvm_hyperv_exit {
218struct kvm_run { 218struct kvm_run {
219 /* in */ 219 /* in */
220 __u8 request_interrupt_window; 220 __u8 request_interrupt_window;
221 __u8 padding1[7]; 221 __u8 immediate_exit;
222 __u8 padding1[6];
222 223
223 /* out */ 224 /* out */
224 __u32 exit_reason; 225 __u32 exit_reason;
@@ -881,6 +882,7 @@ struct kvm_ppc_resize_hpt {
881#define KVM_CAP_SPAPR_RESIZE_HPT 133 882#define KVM_CAP_SPAPR_RESIZE_HPT 133
882#define KVM_CAP_PPC_MMU_RADIX 134 883#define KVM_CAP_PPC_MMU_RADIX 134
883#define KVM_CAP_PPC_MMU_HASH_V3 135 884#define KVM_CAP_PPC_MMU_HASH_V3 135
885#define KVM_CAP_IMMEDIATE_EXIT 136
884 886
885#ifdef KVM_CAP_IRQ_ROUTING 887#ifdef KVM_CAP_IRQ_ROUTING
886 888
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 482612b4e496..cc4d6e0dd2a2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -506,11 +506,6 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
506 if (!slots) 506 if (!slots)
507 return NULL; 507 return NULL;
508 508
509 /*
510 * Init kvm generation close to the maximum to easily test the
511 * code of handling generation number wrap-around.
512 */
513 slots->generation = -150;
514 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 509 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
515 slots->id_to_index[i] = slots->memslots[i].id = i; 510 slots->id_to_index[i] = slots->memslots[i].id = i;
516 511
@@ -641,9 +636,16 @@ static struct kvm *kvm_create_vm(unsigned long type)
641 636
642 r = -ENOMEM; 637 r = -ENOMEM;
643 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 638 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
644 kvm->memslots[i] = kvm_alloc_memslots(); 639 struct kvm_memslots *slots = kvm_alloc_memslots();
645 if (!kvm->memslots[i]) 640 if (!slots)
646 goto out_err_no_srcu; 641 goto out_err_no_srcu;
642 /*
643 * Generations must be different for each address space.
644 * Init kvm generation close to the maximum to easily test the
645 * code of handling generation number wrap-around.
646 */
647 slots->generation = i * 2 - 150;
648 rcu_assign_pointer(kvm->memslots[i], slots);
647 } 649 }
648 650
649 if (init_srcu_struct(&kvm->srcu)) 651 if (init_srcu_struct(&kvm->srcu))
@@ -870,8 +872,14 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
870 * Increment the new memslot generation a second time. This prevents 872 * Increment the new memslot generation a second time. This prevents
871 * vm exits that race with memslot updates from caching a memslot 873 * vm exits that race with memslot updates from caching a memslot
872 * generation that will (potentially) be valid forever. 874 * generation that will (potentially) be valid forever.
875 *
876 * Generations must be unique even across address spaces. We do not need
877 * a global counter for that, instead the generation space is evenly split
878 * across address spaces. For example, with two address spaces, address
879 * space 0 will use generations 0, 4, 8, ... while address space 1 will
880 * use generations 2, 6, 10, 14, ...
873 */ 881 */
874 slots->generation++; 882 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
875 883
876 kvm_arch_memslots_updated(kvm, slots); 884 kvm_arch_memslots_updated(kvm, slots);
877 885
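
Spelling out the arithmetic behind the comment above: with KVM_ADDRESS_SPACE_NUM equal to 2 (x86 with SMM) and assuming the first generation bump earlier in install_new_memslots() is by 1 (that line is outside this hunk), every update advances a generation by 4, and the i * 2 - 150 initialization above keeps the two address spaces on disjoint residues, so their generations can never collide; the 0/4/8 versus 2/6/10 example in the comment simply abstracts away the -150 debugging offset. A throwaway user-space check:

#include <stdio.h>

#define KVM_ADDRESS_SPACE_NUM 2

int main(void)
{
        for (int as = 0; as < KVM_ADDRESS_SPACE_NUM; as++) {
                long gen = as * 2 - 150;        /* initial value from kvm_create_vm() */

                printf("address space %d:", as);
                for (int update = 0; update < 4; update++) {
                        gen += 1;                               /* first bump (outside this hunk) */
                        gen += KVM_ADDRESS_SPACE_NUM * 2 - 1;   /* second bump, as above */
                        printf(" %ld", gen);
                }
                printf("\n");
        }
        return 0;       /* prints -146 -142 ... versus -144 -140 ...: never equal */
}
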
@@ -1094,37 +1102,31 @@ int kvm_get_dirty_log(struct kvm *kvm,
1094{ 1102{
1095 struct kvm_memslots *slots; 1103 struct kvm_memslots *slots;
1096 struct kvm_memory_slot *memslot; 1104 struct kvm_memory_slot *memslot;
1097 int r, i, as_id, id; 1105 int i, as_id, id;
1098 unsigned long n; 1106 unsigned long n;
1099 unsigned long any = 0; 1107 unsigned long any = 0;
1100 1108
1101 r = -EINVAL;
1102 as_id = log->slot >> 16; 1109 as_id = log->slot >> 16;
1103 id = (u16)log->slot; 1110 id = (u16)log->slot;
1104 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1111 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1105 goto out; 1112 return -EINVAL;
1106 1113
1107 slots = __kvm_memslots(kvm, as_id); 1114 slots = __kvm_memslots(kvm, as_id);
1108 memslot = id_to_memslot(slots, id); 1115 memslot = id_to_memslot(slots, id);
1109 r = -ENOENT;
1110 if (!memslot->dirty_bitmap) 1116 if (!memslot->dirty_bitmap)
1111 goto out; 1117 return -ENOENT;
1112 1118
1113 n = kvm_dirty_bitmap_bytes(memslot); 1119 n = kvm_dirty_bitmap_bytes(memslot);
1114 1120
1115 for (i = 0; !any && i < n/sizeof(long); ++i) 1121 for (i = 0; !any && i < n/sizeof(long); ++i)
1116 any = memslot->dirty_bitmap[i]; 1122 any = memslot->dirty_bitmap[i];
1117 1123
1118 r = -EFAULT;
1119 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1124 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
1120 goto out; 1125 return -EFAULT;
1121 1126
1122 if (any) 1127 if (any)
1123 *is_dirty = 1; 1128 *is_dirty = 1;
1124 1129 return 0;
1125 r = 0;
1126out:
1127 return r;
1128} 1130}
1129EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1131EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
1130 1132
@@ -1156,24 +1158,22 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1156{ 1158{
1157 struct kvm_memslots *slots; 1159 struct kvm_memslots *slots;
1158 struct kvm_memory_slot *memslot; 1160 struct kvm_memory_slot *memslot;
1159 int r, i, as_id, id; 1161 int i, as_id, id;
1160 unsigned long n; 1162 unsigned long n;
1161 unsigned long *dirty_bitmap; 1163 unsigned long *dirty_bitmap;
1162 unsigned long *dirty_bitmap_buffer; 1164 unsigned long *dirty_bitmap_buffer;
1163 1165
1164 r = -EINVAL;
1165 as_id = log->slot >> 16; 1166 as_id = log->slot >> 16;
1166 id = (u16)log->slot; 1167 id = (u16)log->slot;
1167 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1168 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
1168 goto out; 1169 return -EINVAL;
1169 1170
1170 slots = __kvm_memslots(kvm, as_id); 1171 slots = __kvm_memslots(kvm, as_id);
1171 memslot = id_to_memslot(slots, id); 1172 memslot = id_to_memslot(slots, id);
1172 1173
1173 dirty_bitmap = memslot->dirty_bitmap; 1174 dirty_bitmap = memslot->dirty_bitmap;
1174 r = -ENOENT;
1175 if (!dirty_bitmap) 1175 if (!dirty_bitmap)
1176 goto out; 1176 return -ENOENT;
1177 1177
1178 n = kvm_dirty_bitmap_bytes(memslot); 1178 n = kvm_dirty_bitmap_bytes(memslot);
1179 1179
@@ -1202,14 +1202,9 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
1202 } 1202 }
1203 1203
1204 spin_unlock(&kvm->mmu_lock); 1204 spin_unlock(&kvm->mmu_lock);
1205
1206 r = -EFAULT;
1207 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1205 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
1208 goto out; 1206 return -EFAULT;
1209 1207 return 0;
1210 r = 0;
1211out:
1212 return r;
1213} 1208}
1214EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1209EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
1215#endif 1210#endif
@@ -1937,10 +1932,10 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
1937} 1932}
1938EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1933EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
1939 1934
1940int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1935static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
1941 gpa_t gpa, unsigned long len) 1936 struct gfn_to_hva_cache *ghc,
1937 gpa_t gpa, unsigned long len)
1942{ 1938{
1943 struct kvm_memslots *slots = kvm_memslots(kvm);
1944 int offset = offset_in_page(gpa); 1939 int offset = offset_in_page(gpa);
1945 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1940 gfn_t start_gfn = gpa >> PAGE_SHIFT;
1946 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1941 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
@@ -1950,7 +1945,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1950 ghc->gpa = gpa; 1945 ghc->gpa = gpa;
1951 ghc->generation = slots->generation; 1946 ghc->generation = slots->generation;
1952 ghc->len = len; 1947 ghc->len = len;
1953 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1948 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
1954 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1949 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL);
1955 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1950 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) {
1956 ghc->hva += offset; 1951 ghc->hva += offset;
@@ -1960,7 +1955,7 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1960 * verify that the entire region is valid here. 1955 * verify that the entire region is valid here.
1961 */ 1956 */
1962 while (start_gfn <= end_gfn) { 1957 while (start_gfn <= end_gfn) {
1963 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1958 ghc->memslot = __gfn_to_memslot(slots, start_gfn);
1964 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1959 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
1965 &nr_pages_avail); 1960 &nr_pages_avail);
1966 if (kvm_is_error_hva(ghc->hva)) 1961 if (kvm_is_error_hva(ghc->hva))
@@ -1972,22 +1967,29 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1972 } 1967 }
1973 return 0; 1968 return 0;
1974} 1969}
1975EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
1976 1970
1977int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1971int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
1978 void *data, int offset, unsigned long len) 1972 gpa_t gpa, unsigned long len)
1979{ 1973{
1980 struct kvm_memslots *slots = kvm_memslots(kvm); 1974 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
1975 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
1976}
1977EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init);
1978
1979int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
1980 void *data, int offset, unsigned long len)
1981{
1982 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
1981 int r; 1983 int r;
1982 gpa_t gpa = ghc->gpa + offset; 1984 gpa_t gpa = ghc->gpa + offset;
1983 1985
1984 BUG_ON(len + offset > ghc->len); 1986 BUG_ON(len + offset > ghc->len);
1985 1987
1986 if (slots->generation != ghc->generation) 1988 if (slots->generation != ghc->generation)
1987 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1989 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
1988 1990
1989 if (unlikely(!ghc->memslot)) 1991 if (unlikely(!ghc->memslot))
1990 return kvm_write_guest(kvm, gpa, data, len); 1992 return kvm_vcpu_write_guest(vcpu, gpa, data, len);
1991 1993
1992 if (kvm_is_error_hva(ghc->hva)) 1994 if (kvm_is_error_hva(ghc->hva))
1993 return -EFAULT; 1995 return -EFAULT;
@@ -1999,28 +2001,28 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
1999 2001
2000 return 0; 2002 return 0;
2001} 2003}
2002EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2004EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached);
2003 2005
2004int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2006int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
2005 void *data, unsigned long len) 2007 void *data, unsigned long len)
2006{ 2008{
2007 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2009 return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len);
2008} 2010}
2009EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2011EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached);
2010 2012
2011int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2013int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc,
2012 void *data, unsigned long len) 2014 void *data, unsigned long len)
2013{ 2015{
2014 struct kvm_memslots *slots = kvm_memslots(kvm); 2016 struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2015 int r; 2017 int r;
2016 2018
2017 BUG_ON(len > ghc->len); 2019 BUG_ON(len > ghc->len);
2018 2020
2019 if (slots->generation != ghc->generation) 2021 if (slots->generation != ghc->generation)
2020 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 2022 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
2021 2023
2022 if (unlikely(!ghc->memslot)) 2024 if (unlikely(!ghc->memslot))
2023 return kvm_read_guest(kvm, ghc->gpa, data, len); 2025 return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len);
2024 2026
2025 if (kvm_is_error_hva(ghc->hva)) 2027 if (kvm_is_error_hva(ghc->hva))
2026 return -EFAULT; 2028 return -EFAULT;
@@ -2031,7 +2033,7 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
2031 2033
2032 return 0; 2034 return 0;
2033} 2035}
2034EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2036EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached);
2035 2037
2036int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2038int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
2037{ 2039{
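
With this rename the cached accessors resolve memslots through the vcpu (kvm_vcpu_memslots()) instead of through struct kvm. A minimal caller sketch using the new signatures from the include/linux/kvm_host.h hunk above; the enclosing function, vcpu and gpa are assumed and not taken from this patch:

        struct gfn_to_hva_cache ghc;
        u32 val;

        /* Translate gpa once and remember the hva plus the memslot generation. */
        if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &ghc, gpa, sizeof(val)))
                return -EFAULT;

        /* Later accesses reuse the cached hva unless the generation moved on. */
        if (kvm_vcpu_read_guest_cached(vcpu, &ghc, &val, sizeof(val)))
                return -EFAULT;

        val |= 1;
        return kvm_vcpu_write_guest_cached(vcpu, &ghc, &val, sizeof(val));
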
@@ -3133,10 +3135,9 @@ static long kvm_vm_compat_ioctl(struct file *filp,
3133 struct compat_kvm_dirty_log compat_log; 3135 struct compat_kvm_dirty_log compat_log;
3134 struct kvm_dirty_log log; 3136 struct kvm_dirty_log log;
3135 3137
3136 r = -EFAULT;
3137 if (copy_from_user(&compat_log, (void __user *)arg, 3138 if (copy_from_user(&compat_log, (void __user *)arg,
3138 sizeof(compat_log))) 3139 sizeof(compat_log)))
3139 goto out; 3140 return -EFAULT;
3140 log.slot = compat_log.slot; 3141 log.slot = compat_log.slot;
3141 log.padding1 = compat_log.padding1; 3142 log.padding1 = compat_log.padding1;
3142 log.padding2 = compat_log.padding2; 3143 log.padding2 = compat_log.padding2;
@@ -3148,8 +3149,6 @@ static long kvm_vm_compat_ioctl(struct file *filp,
3148 default: 3149 default:
3149 r = kvm_vm_ioctl(filp, ioctl, arg); 3150 r = kvm_vm_ioctl(filp, ioctl, arg);
3150 } 3151 }
3151
3152out:
3153 return r; 3152 return r;
3154} 3153}
3155#endif 3154#endif