Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--	arch/x86/kvm/x86.c	184
1 file changed, 120 insertions, 64 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c243b81e3c74..f71500af1f81 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -872,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	kvm_x86_ops->set_efer(vcpu, efer);
 
-	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-
 	/* Update reserved bits */
 	if ((efer ^ old_efer) & EFER_NX)
 		kvm_mmu_reset_context(vcpu);
@@ -1881,6 +1879,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	u64 data = msr_info->data;
 
 	switch (msr) {
+	case MSR_AMD64_NB_CFG:
+	case MSR_IA32_UCODE_REV:
+	case MSR_IA32_UCODE_WRITE:
+	case MSR_VM_HSAVE_PA:
+	case MSR_AMD64_PATCH_LOADER:
+	case MSR_AMD64_BU_CFG2:
+		break;
+
 	case MSR_EFER:
 		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
@@ -1900,8 +1906,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			return 1;
 		}
 		break;
-	case MSR_AMD64_NB_CFG:
-		break;
 	case MSR_IA32_DEBUGCTLMSR:
 		if (!data) {
 			/* We support the non-activated case already */
@@ -1914,11 +1918,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
 			    __func__, data);
 		break;
-	case MSR_IA32_UCODE_REV:
-	case MSR_IA32_UCODE_WRITE:
-	case MSR_VM_HSAVE_PA:
-	case MSR_AMD64_PATCH_LOADER:
-		break;
 	case 0x200 ... 0x2ff:
 		return set_msr_mtrr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
@@ -2253,6 +2252,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_INT_PENDING_MSG:
 	case MSR_AMD64_NB_CFG:
 	case MSR_FAM10H_MMIO_CONF_BASE:
+	case MSR_AMD64_BU_CFG2:
 		data = 0;
 		break;
 	case MSR_P6_PERFCTR0:
@@ -2520,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_MAX_VCPUS;
 		break;
 	case KVM_CAP_NR_MEMSLOTS:
-		r = KVM_MEMORY_SLOTS;
+		r = KVM_USER_MEM_SLOTS;
 		break;
 	case KVM_CAP_PV_MMU: /* obsolete */
 		r = 0;
@@ -3272,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 		return -EINVAL;
 
 	mutex_lock(&kvm->slots_lock);
-	spin_lock(&kvm->mmu_lock);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
-	spin_unlock(&kvm->mmu_lock);
 	mutex_unlock(&kvm->slots_lock);
 	return 0;
 }
@@ -3437,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 	mutex_lock(&kvm->slots_lock);
 
 	r = -EINVAL;
-	if (log->slot >= KVM_MEMORY_SLOTS)
+	if (log->slot >= KVM_USER_MEM_SLOTS)
 		goto out;
 
 	memslot = id_to_memslot(kvm->memslots, log->slot);
@@ -4493,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
 	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
 	*selector = var.selector;
 
-	if (var.unusable)
+	if (var.unusable) {
+		memset(desc, 0, sizeof(*desc));
 		return false;
+	}
 
 	if (var.g)
 		var.limit >>= 12;
@@ -4755,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+				  bool write_fault_to_shadow_pgtable)
 {
-	gpa_t gpa;
+	gpa_t gpa = cr2;
 	pfn_t pfn;
 
-	if (tdp_enabled)
-		return false;
-
-	/*
-	 * if emulation was due to access to shadowed page table
-	 * and it failed try to unshadow page and re-enter the
-	 * guest to let CPU execute the instruction.
-	 */
-	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
-		return true;
-
-	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
-
-	if (gpa == UNMAPPED_GVA)
-		return true; /* let cpu generate fault */
+	if (!vcpu->arch.mmu.direct_map) {
+		/*
+		 * Write permission should be allowed since only
+		 * write access need to be emulated.
+		 */
+		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+
+		/*
+		 * If the mapping is invalid in guest, let cpu retry
+		 * it to generate fault.
+		 */
+		if (gpa == UNMAPPED_GVA)
+			return true;
+	}
 
 	/*
 	 * Do not retry the unhandleable instruction if it faults on the
@@ -4783,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 	 * instruction -> ...
 	 */
 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
-	if (!is_error_noslot_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
+
+	/*
+	 * If the instruction failed on the error pfn, it can not be fixed,
+	 * report the error to userspace.
+	 */
+	if (is_error_noslot_pfn(pfn))
+		return false;
+
+	kvm_release_pfn_clean(pfn);
+
+	/* The instructions are well-emulated on direct mmu. */
+	if (vcpu->arch.mmu.direct_map) {
+		unsigned int indirect_shadow_pages;
+
+		spin_lock(&vcpu->kvm->mmu_lock);
+		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+
+		if (indirect_shadow_pages)
+			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
 		return true;
 	}
 
-	return false;
+	/*
+	 * if emulation was due to access to shadowed page table
+	 * and it failed try to unshadow page and re-enter the
+	 * guest to let CPU execute the instruction.
+	 */
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
+
+	/*
+	 * If the access faults on its page table, it can not
+	 * be fixed by unprotecting shadow page and it should
+	 * be reported to userspace.
+	 */
+	return !write_fault_to_shadow_pgtable;
 }
 
 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
@@ -4830,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 	if (!vcpu->arch.mmu.direct_map)
 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
 
-	kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 
 	return true;
 }
@@ -4847,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	int r;
 	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	bool writeback = true;
+	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
 
+	/*
+	 * Clear write_fault_to_shadow_pgtable here to ensure it is
+	 * never reused.
+	 */
+	vcpu->arch.write_fault_to_shadow_pgtable = false;
 	kvm_clear_exception_queue(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
@@ -4866,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 		if (r != EMULATION_OK) {
 			if (emulation_type & EMULTYPE_TRAP_UD)
 				return EMULATE_FAIL;
-			if (reexecute_instruction(vcpu, cr2))
+			if (reexecute_instruction(vcpu, cr2,
+						  write_fault_to_spt))
 				return EMULATE_DONE;
 			if (emulation_type & EMULTYPE_SKIP)
 				return EMULATE_FAIL;
@@ -4896,7 +4934,7 @@ restart:
 		return EMULATE_DONE;
 
 	if (r == EMULATION_FAILED) {
-		if (reexecute_instruction(vcpu, cr2))
+		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
@@ -5539,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 			vcpu->arch.nmi_injected = true;
 			kvm_x86_ops->set_nmi(vcpu);
 		}
-	} else if (kvm_cpu_has_interrupt(vcpu)) {
+	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
 					    false);
@@ -5607,6 +5645,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static void update_eoi_exitmap(struct kvm_vcpu *vcpu)
+{
+	u64 eoi_exit_bitmap[4];
+
+	memset(eoi_exit_bitmap, 0, 32);
+
+	kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -5660,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_handle_pmu_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_deliver_pmi(vcpu);
+		if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
+			update_eoi_exitmap(vcpu);
 	}
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -5668,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		/* enable NMI/IRQ window open exits if needed */
 		if (vcpu->arch.nmi_pending)
 			kvm_x86_ops->enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
 			kvm_x86_ops->enable_irq_window(vcpu);
 
 		if (kvm_lapic_enabled(vcpu)) {
+			/*
+			 * Update architecture specific hints for APIC
+			 * virtual interrupt delivery.
+			 */
+			if (kvm_x86_ops->hwapic_irr_update)
+				kvm_x86_ops->hwapic_irr_update(vcpu,
+					kvm_lapic_find_highest_irr(vcpu));
 			update_cr8_intercept(vcpu);
 			kvm_lapic_sync_to_vapic(vcpu);
 		}
@@ -6851,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				struct kvm_memory_slot old,
 				struct kvm_userspace_memory_region *mem,
-				int user_alloc)
+				bool user_alloc)
 {
 	int npages = memslot->npages;
-	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
-	/* Prevent internal slot pages from being moved by fork()/COW. */
-	if (memslot->id >= KVM_MEMORY_SLOTS)
-		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
-	/*To keep backward compatibility with older userspace,
-	 *x86 needs to handle !user_alloc case.
+	/*
+	 * Only private memory slots need to be mapped here since
+	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
 	 */
-	if (!user_alloc) {
-		if (npages && !old.npages) {
-			unsigned long userspace_addr;
+	if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) {
+		unsigned long userspace_addr;
 
-			userspace_addr = vm_mmap(NULL, 0,
-						 npages * PAGE_SIZE,
-						 PROT_READ | PROT_WRITE,
-						 map_flags,
-						 0);
+		/*
+		 * MAP_SHARED to prevent internal slot pages from being moved
+		 * by fork()/COW.
+		 */
+		userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE,
+					 PROT_READ | PROT_WRITE,
+					 MAP_SHARED | MAP_ANONYMOUS, 0);
 
-			if (IS_ERR((void *)userspace_addr))
-				return PTR_ERR((void *)userspace_addr);
+		if (IS_ERR((void *)userspace_addr))
+			return PTR_ERR((void *)userspace_addr);
 
-			memslot->userspace_addr = userspace_addr;
-		}
+		memslot->userspace_addr = userspace_addr;
 	}
 
-
 	return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem,
 				struct kvm_memory_slot old,
-				int user_alloc)
+				bool user_alloc)
 {
 
 	int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
 
-	if (!user_alloc && !old.user_alloc && old.npages && !npages) {
+	if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) {
 		int ret;
 
 		ret = vm_munmap(old.userspace_addr,
@@ -6906,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	if (!kvm->arch.n_requested_mmu_pages)
 		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
 
-	spin_lock(&kvm->mmu_lock);
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
-	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
-	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * Write protect all pages for dirty logging.
+	 * Existing largepage mappings are destroyed here and new ones will
+	 * not be created until the end of the logging.
+	 */
+	if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
+		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
 	/*
 	 * If memory slot is created, or moved, we need to clear all
 	 * mmio sptes.