Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/include/asm/kvm_emulate.h    |  5
-rw-r--r--  arch/arm/include/asm/kvm_host.h       |  2
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h        |  6
-rw-r--r--  arch/arm/kvm/arm.c                    | 78
-rw-r--r--  arch/arm/kvm/guest.c                  | 26
-rw-r--r--  arch/arm/kvm/mmio.c                   | 15
-rw-r--r--  arch/arm/kvm/mmu.c                    | 99
-rw-r--r--  arch/arm/kvm/psci.c                   | 18
-rw-r--r--  arch/arm64/include/asm/kvm_emulate.h  |  5
-rw-r--r--  arch/arm64/include/asm/kvm_host.h     |  3
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h      |  6
-rw-r--r--  arch/arm64/kvm/guest.c                | 26
-rw-r--r--  arch/x86/include/asm/kvm_host.h       | 10
-rw-r--r--  arch/x86/kernel/kvm.c                 |  9
-rw-r--r--  arch/x86/kernel/kvmclock.c            |  1
-rw-r--r--  arch/x86/kvm/emulate.c                | 20
-rw-r--r--  arch/x86/kvm/ioapic.h                 | 17
-rw-r--r--  arch/x86/kvm/mmu.c                    |  6
-rw-r--r--  arch/x86/kvm/vmx.c                    |  4
19 files changed, 261 insertions(+), 95 deletions(-)
diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h
index b9db269c6e61..66ce17655bb9 100644
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@@ -33,6 +33,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hcr = HCR_GUEST_MASK;
+}
+
 static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
 {
 	return 1;
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 53036e21756b..254e0650e48b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -150,8 +150,6 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };
 
-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init);
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index acb0d5712716..63e0ecc04901 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -52,6 +52,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
@@ -161,9 +162,10 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 }
 
 static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size)
+					     unsigned long size,
+					     bool ipa_uncached)
 {
-	if (!vcpu_has_cache_enabled(vcpu))
+	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 		kvm_flush_dcache_to_poc((void *)hva, size);
 
 	/*
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9e193c8a959e..2d6d91001062 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -213,6 +213,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err;
 	struct kvm_vcpu *vcpu;
 
+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
+		err = -EBUSY;
+		goto out;
+	}
+
 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	if (!vcpu) {
 		err = -ENOMEM;
@@ -263,6 +268,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	/* Force users to call KVM_ARM_VCPU_INIT */
 	vcpu->arch.target = -1;
+	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
 
 	/* Set up the timer */
 	kvm_timer_vcpu_init(vcpu);
@@ -419,6 +425,7 @@ static void update_vttbr(struct kvm *kvm)
 
 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
 	int ret;
 
 	if (likely(vcpu->arch.has_run_once))
@@ -427,15 +434,23 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.has_run_once = true;
 
 	/*
-	 * Initialize the VGIC before running a vcpu the first time on
-	 * this VM.
+	 * Map the VGIC hardware resources before running a vcpu the first
+	 * time on this VM.
 	 */
-	if (unlikely(!vgic_initialized(vcpu->kvm))) {
-		ret = kvm_vgic_init(vcpu->kvm);
+	if (unlikely(!vgic_ready(kvm))) {
+		ret = kvm_vgic_map_resources(kvm);
 		if (ret)
 			return ret;
 	}
 
+	/*
+	 * Enable the arch timers only if we have an in-kernel VGIC
+	 * and it has been properly initialized, since we cannot handle
+	 * interrupts from the virtual timer with a userspace gic.
+	 */
+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+		kvm_timer_enable(kvm);
+
 	return 0;
 }
 
@@ -649,6 +664,48 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 	return -EINVAL;
 }
 
+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+			       const struct kvm_vcpu_init *init)
+{
+	unsigned int i;
+	int phys_target = kvm_target_cpu();
+
+	if (init->target != phys_target)
+		return -EINVAL;
+
+	/*
+	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+	 * use the same target.
+	 */
+	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+		return -EINVAL;
+
+	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
+	for (i = 0; i < sizeof(init->features) * 8; i++) {
+		bool set = (init->features[i / 32] & (1 << (i % 32)));
+
+		if (set && i >= KVM_VCPU_MAX_FEATURES)
+			return -ENOENT;
+
+		/*
+		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+		 * use the same feature set.
+		 */
+		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
+		    test_bit(i, vcpu->arch.features) != set)
+			return -EINVAL;
+
+		if (set)
+			set_bit(i, vcpu->arch.features);
+	}
+
+	vcpu->arch.target = phys_target;
+
+	/* Now we know what it is, we can reset it. */
+	return kvm_reset_vcpu(vcpu);
+}
+
+
 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 					 struct kvm_vcpu_init *init)
 {
@@ -659,10 +716,21 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 		return ret;
 
 	/*
+	 * Ensure a rebooted VM will fault in RAM pages and detect if the
+	 * guest MMU is turned off and flush the caches as needed.
+	 */
+	if (vcpu->arch.has_run_once)
+		stage2_unmap_vm(vcpu->kvm);
+
+	vcpu_reset_hcr(vcpu);
+
+	/*
 	 * Handle the "start in power-off" case by marking the VCPU as paused.
 	 */
-	if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
 		vcpu->arch.pause = true;
+	else
+		vcpu->arch.pause = false;
 
 	return 0;
 }
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index cc0b78769bd8..384bab67c462 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.hcr = HCR_GUEST_MASK;
 	return 0;
 }
 
@@ -274,31 +273,6 @@ int __attribute_const__ kvm_target_cpu(void)
 	}
 }
 
-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-
-	/* We can only cope with guest==host and only on A15/A7 (for now). */
-	if (init->target != kvm_target_cpu())
-		return -EINVAL;
-
-	vcpu->arch.target = init->target;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		if (test_bit(i, (void *)init->features)) {
-			if (i >= KVM_VCPU_MAX_FEATURES)
-				return -ENOENT;
-			set_bit(i, vcpu->arch.features);
-		}
-	}
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 {
 	int target = kvm_target_cpu();
diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c
index 4cb5a93182e9..5d3bfc0eb3f0 100644
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@@ -187,15 +187,18 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	}
 
 	rt = vcpu->arch.mmio_decode.rt;
-	data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len);
 
-	trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE :
-		       KVM_TRACE_MMIO_READ_UNSATISFIED,
-		       mmio.len, fault_ipa,
-		       (mmio.is_write) ? data : 0);
+	if (mmio.is_write) {
+		data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt),
+					       mmio.len);
 
-	if (mmio.is_write)
+		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, mmio.len,
+			       fault_ipa, data);
 		mmio_write_buf(mmio.data, mmio.len, data);
+	} else {
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len,
+			       fault_ipa, 0);
+	}
 
 	if (vgic_handle_mmio(vcpu, run, &mmio))
 		return 1;
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 57a403a5c22b..3756dd3e85c2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -611,6 +611,71 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 	unmap_range(kvm, kvm->arch.pgd, start, size);
 }
 
+static void stage2_unmap_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *memslot)
+{
+	hva_t hva = memslot->userspace_addr;
+	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+	phys_addr_t size = PAGE_SIZE * memslot->npages;
+	hva_t reg_end = hva + size;
+
+	/*
+	 * A memory region could potentially cover multiple VMAs, and any holes
+	 * between them, so iterate over all of them to find out if we should
+	 * unmap any of them.
+	 *
+	 *     +--------------------------------------------+
+	 * +---------------+----------------+   +----------------+
+	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+	 * +---------------+----------------+   +----------------+
+	 *     |               memory region                |
+	 *     +--------------------------------------------+
+	 */
+	do {
+		struct vm_area_struct *vma = find_vma(current->mm, hva);
+		hva_t vm_start, vm_end;
+
+		if (!vma || vma->vm_start >= reg_end)
+			break;
+
+		/*
+		 * Take the intersection of this VMA with the memory region
+		 */
+		vm_start = max(hva, vma->vm_start);
+		vm_end = min(reg_end, vma->vm_end);
+
+		if (!(vma->vm_flags & VM_PFNMAP)) {
+			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+		}
+		hva = vm_end;
+	} while (hva < reg_end);
+}
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any regular RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots)
+		stage2_unmap_memslot(kvm, memslot);
+
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
 /**
  * kvm_free_stage2_pgd - free all stage-2 tables
  * @kvm: The KVM struct pointer for the VM.
@@ -834,6 +899,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_dabt_iswrite(vcpu);
 }
 
+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@@ -847,6 +917,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct vm_area_struct *vma;
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
+	bool fault_ipa_uncached;
 
 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@@ -904,7 +975,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;
 
-	if (kvm_is_mmio_pfn(pfn))
+	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;
 
 	spin_lock(&kvm->mmu_lock);
@@ -913,6 +984,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
 
+	fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
+
 	if (hugetlb) {
 		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
 		new_pmd = pmd_mkhuge(new_pmd);
@@ -920,7 +993,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pmd_writable(&new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
+		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE,
+					  fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -928,7 +1002,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
+		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
+					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
 			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
 	}
@@ -1288,11 +1363,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		hva = vm_end;
 	} while (hva < reg_end);
 
-	if (ret) {
-		spin_lock(&kvm->mmu_lock);
+	spin_lock(&kvm->mmu_lock);
+	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-		spin_unlock(&kvm->mmu_lock);
-	}
+	else
+		stage2_flush_memslot(kvm, memslot);
+	spin_unlock(&kvm->mmu_lock);
 	return ret;
 }
 
@@ -1304,6 +1380,15 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages)
 {
+	/*
+	 * Readonly memslots are not incoherent with the caches by definition,
+	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
+	 * that the guest may consider devices and hence map as uncached.
+	 * To prevent incoherency issues in these cases, tag all readonly
+	 * regions as incoherent.
+	 */
+	if (slot->flags & KVM_MEM_READONLY)
+		slot->flags |= KVM_MEMSLOT_INCOHERENT;
 	return 0;
 }
 
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 09cf37737ee2..58cb3248d277 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -15,6 +15,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/preempt.h>
 #include <linux/kvm_host.h>
 #include <linux/wait.h>
 
@@ -166,6 +167,23 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
 
 static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 {
+	int i;
+	struct kvm_vcpu *tmp;
+
+	/*
+	 * The KVM ABI specifies that a system event exit may call KVM_RUN
+	 * again and may perform shutdown/reboot at a later time than when the
+	 * actual request is made. Since we are implementing PSCI and a
+	 * caller of PSCI reboot and shutdown expects that the system shuts
+	 * down or reboots immediately, let's make sure that VCPUs are not run
+	 * after this call is handled and before the VCPUs have been
+	 * re-initialized.
+	 */
+	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+		tmp->arch.pause = true;
+		kvm_vcpu_kick(tmp);
+	}
+
 	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 	vcpu->run->system_event.type = type;
 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index 5674a55b5518..8127e45e2637 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -38,6 +38,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 
+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
 	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2012c4ba8d67..0b7dfdb931df 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -165,8 +165,6 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };
 
-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init);
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
@@ -200,6 +198,7 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);
 
 u64 kvm_call_hyp(void *hypfn, ...);
+void force_vm_exit(const cpumask_t *mask);
 
 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0caf7a59f6a1..14a74f136272 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -83,6 +83,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);
 
+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
@@ -243,9 +244,10 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 }
 
 static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size)
+					     unsigned long size,
+					     bool ipa_uncached)
 {
-	if (!vcpu_has_cache_enabled(vcpu))
+	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 		kvm_flush_dcache_to_poc((void *)hva, size);
 
 	if (!icache_is_aliasing()) {	/* PIPT */
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 76794692c20b..9535bd555d1d 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
 	return 0;
 }
 
@@ -297,31 +296,6 @@ int __attribute_const__ kvm_target_cpu(void)
 	return -EINVAL;
 }
 
-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-	int phys_target = kvm_target_cpu();
-
-	if (init->target != phys_target)
-		return -EINVAL;
-
-	vcpu->arch.target = phys_target;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		if (init->features[i / 32] & (1 << (i % 32))) {
-			if (i >= KVM_VCPU_MAX_FEATURES)
-				return -ENOENT;
-			set_bit(i, vcpu->arch.features);
-		}
-	}
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 {
 	int target = kvm_target_cpu();
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c4c88c008ce..d89c6b828c96 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -664,6 +664,16 @@ struct msr_data {
 	u64 data;
 };
 
+struct kvm_lapic_irq {
+	u32 vector;
+	u32 delivery_mode;
+	u32 dest_mode;
+	u32 level;
+	u32 trig_mode;
+	u32 shorthand;
+	u32 dest_id;
+};
+
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f6945bef2cd1..94f643484300 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault);
 static void __init paravirt_ops_setup(void)
 {
 	pv_info.name = "KVM";
-	pv_info.paravirt_enabled = 1;
+
+	/*
+	 * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+	 * guest kernel works like a bare metal kernel with additional
+	 * features, and paravirt_enabled is about features that are
+	 * missing.
+	 */
+	pv_info.paravirt_enabled = 0;
 
 	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
 		pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 0bf3467d7f30..42caaef897c8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -261,7 +261,6 @@ void __init kvmclock_init(void)
 #endif
 	kvm_get_preset_lpj();
 	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
-	pv_info.paravirt_enabled = 1;
 	pv_info.name = "KVM";
 
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38173343153f..9715d6ea7d72 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1861,7 +1861,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->src.val = (unsigned long)ctxt->eflags;
+	ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
 	return em_push(ctxt);
 }
 
@@ -2130,7 +2130,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	/* Outer-privilege level return is not implemented */
 	if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
 		return X86EMUL_UNHANDLEABLE;
-	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false,
+	rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
 				       &new_desc);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -4172,8 +4172,8 @@ static const struct opcode opcode_map_0f_38[256] = {
 	/* 0x80 - 0xef */
 	X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
 	/* 0xf0 - 0xf1 */
-	GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
-	GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
+	GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+	GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
 	/* 0xf2 - 0xff */
 	N, N, X4(N), X8(N)
 };
@@ -4801,6 +4801,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 			goto done;
 		}
 
+		/* Instruction can only be executed in protected mode */
+		if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+			rc = emulate_ud(ctxt);
+			goto done;
+		}
+
 		/* Privileged instruction can be executed only in CPL=0 */
 		if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
 			if (ctxt->d & PrivUD)
@@ -4810,12 +4816,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 			goto done;
 		}
 
-		/* Instruction can only be executed in protected mode */
-		if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
-			rc = emulate_ud(ctxt);
-			goto done;
-		}
-
 		/* Do instruction specific permission checks */
 		if (ctxt->d & CheckPerm) {
 			rc = ctxt->check_perm(ctxt);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index deac8d509f2a..3c9195535ffc 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -44,6 +44,23 @@ struct rtc_status {
 	DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
 };
 
+union kvm_ioapic_redirect_entry {
+	u64 bits;
+	struct {
+		u8 vector;
+		u8 delivery_mode:3;
+		u8 dest_mode:1;
+		u8 delivery_status:1;
+		u8 polarity:1;
+		u8 remote_irr:1;
+		u8 trig_mode:1;
+		u8 mask:1;
+		u8 reserve:7;
+		u8 reserved[4];
+		u8 dest_id;
+	} fields;
+};
+
 struct kvm_ioapic {
 	u64 base_address;
 	u32 ioregsel;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4ea0dcb0b21b..10fbed126b11 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -629,7 +629,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 * kvm mmu, before reclaiming the page, we should
 	 * unmap it from mmu first.
 	 */
-	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 
 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
@@ -2460,7 +2460,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_reserved_pfn(pfn));
 
 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@@ -2736,7 +2736,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9bcc871f0635..feb852b04598 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2399,13 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
 	nested_vmx_secondary_ctls_low = 0;
 	nested_vmx_secondary_ctls_high &=
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-		SECONDARY_EXEC_UNRESTRICTED_GUEST |
 		SECONDARY_EXEC_WBINVD_EXITING |
 		SECONDARY_EXEC_XSAVES;
 
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
-		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+		nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+			SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 			VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
 			VMX_EPT_INVEPT_BIT;