about summary refs log tree commit diff stats
path: root/arch/x86
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-14 19:07:40 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-14 19:07:40 -0400
commit1609d7604b847a9820e63393d1a3b6cac7286d40 (patch)
tree993914907707ceff9eb965f8c519f0a91f5ab192 /arch/x86
parent1f9c632cde0c3d781463a88ce430a8dd4a7c1a0e (diff)
parenta9c20bb0206ae9384bd470a6832dd8913730add9 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini: "The main change here is a revert of reverts. We recently simplified some code that was thought unnecessary; however, since then KVM has grown quite a few cond_resched()s and for that reason the simplified code is prone to livelocks---one CPU tries to empty a list of guest page tables while the others keep adding to them. This adds back the generation-based zapping of guest page tables, which was not unnecessary after all. On top of this, there is a fix for a kernel memory leak and a couple of s390 fixlets as well" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot KVM: x86: work around leak of uninitialized stack contents KVM: nVMX: handle page fault in vmread KVM: s390: Do not leak kernel stack data in the KVM_S390_INTERRUPT ioctl KVM: s390: kvm_s390_vm_start_migration: check dirty_bitmap before using it as target for memset()
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/kvm_host.h2
-rw-r--r--arch/x86/kvm/mmu.c101
-rw-r--r--arch/x86/kvm/vmx/nested.c4
-rw-r--r--arch/x86/kvm/x86.c7
4 files changed, 111 insertions, 3 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74e88e5edd9c..bdc16b0aa7c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,6 +335,7 @@ struct kvm_mmu_page {
335 int root_count; /* Currently serving as active root */ 335 int root_count; /* Currently serving as active root */
336 unsigned int unsync_children; 336 unsigned int unsync_children;
337 struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ 337 struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
338 unsigned long mmu_valid_gen;
338 DECLARE_BITMAP(unsync_child_bitmap, 512); 339 DECLARE_BITMAP(unsync_child_bitmap, 512);
339 340
340#ifdef CONFIG_X86_32 341#ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
856 unsigned long n_requested_mmu_pages; 857 unsigned long n_requested_mmu_pages;
857 unsigned long n_max_mmu_pages; 858 unsigned long n_max_mmu_pages;
858 unsigned int indirect_shadow_pages; 859 unsigned int indirect_shadow_pages;
860 unsigned long mmu_valid_gen;
859 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 861 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
860 /* 862 /*
861 * Hash table of struct kvm_mmu_page. 863 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 218b277bfda3..a63964e7cec7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
2095 if (!direct) 2095 if (!direct)
2096 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 2096 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2097 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 2097 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2098
2099 /*
2100 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2101 * depends on valid pages being added to the head of the list. See
2102 * comments in kvm_zap_obsolete_pages().
2103 */
2098 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 2104 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2099 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 2105 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2100 return sp; 2106 return sp;
@@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2244#define for_each_valid_sp(_kvm, _sp, _gfn) \ 2250#define for_each_valid_sp(_kvm, _sp, _gfn) \
2245 hlist_for_each_entry(_sp, \ 2251 hlist_for_each_entry(_sp, \
2246 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 2252 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2247 if ((_sp)->role.invalid) { \ 2253 if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
2248 } else 2254 } else
2249 2255
2250#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ 2256#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
@@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2301static void mmu_audit_disable(void) { } 2307static void mmu_audit_disable(void) { }
2302#endif 2308#endif
2303 2309
2310static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2311{
2312 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2313}
2314
2304static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2315static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2305 struct list_head *invalid_list) 2316 struct list_head *invalid_list)
2306{ 2317{
@@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2525 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2536 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2526 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); 2537 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2527 } 2538 }
2539 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2528 clear_page(sp->spt); 2540 clear_page(sp->spt);
2529 trace_kvm_mmu_get_page(sp, true); 2541 trace_kvm_mmu_get_page(sp, true);
2530 2542
@@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4233 return false; 4245 return false;
4234 4246
4235 if (cached_root_available(vcpu, new_cr3, new_role)) { 4247 if (cached_root_available(vcpu, new_cr3, new_role)) {
4248 /*
4249 * It is possible that the cached previous root page is
4250 * obsolete because of a change in the MMU generation
4251 * number. However, changing the generation number is
4252 * accompanied by KVM_REQ_MMU_RELOAD, which will free
4253 * the root set here and allocate a new one.
4254 */
4236 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); 4255 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4237 if (!skip_tlb_flush) { 4256 if (!skip_tlb_flush) {
4238 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 4257 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5649,11 +5668,89 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
5649 return alloc_mmu_pages(vcpu); 5668 return alloc_mmu_pages(vcpu);
5650} 5669}
5651 5670
5671
5672static void kvm_zap_obsolete_pages(struct kvm *kvm)
5673{
5674 struct kvm_mmu_page *sp, *node;
5675 LIST_HEAD(invalid_list);
5676 int ign;
5677
5678restart:
5679 list_for_each_entry_safe_reverse(sp, node,
5680 &kvm->arch.active_mmu_pages, link) {
5681 /*
5682 * No obsolete valid page exists before a newly created page
5683 * since active_mmu_pages is a FIFO list.
5684 */
5685 if (!is_obsolete_sp(kvm, sp))
5686 break;
5687
5688 /*
5689 * Do not repeatedly zap a root page to avoid unnecessary
5690 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
5691 * progress:
5692 * vcpu 0 vcpu 1
5693 * call vcpu_enter_guest():
5694 * 1): handle KVM_REQ_MMU_RELOAD
5695 * and require mmu-lock to
5696 * load mmu
5697 * repeat:
5698 * 1): zap root page and
5699 * send KVM_REQ_MMU_RELOAD
5700 *
5701 * 2): if (cond_resched_lock(mmu-lock))
5702 *
5703 * 2): hold mmu-lock and load mmu
5704 *
5705 * 3): see KVM_REQ_MMU_RELOAD bit
5706 * on vcpu->requests is set
5707 * then return 1 to call
5708 * vcpu_enter_guest() again.
5709 * goto repeat;
5710 *
5711 * Since we are reversely walking the list and the invalid
5712 * list will be moved to the head, skip the invalid page
5713 * can help us to avoid the infinity list walking.
5714 */
5715 if (sp->role.invalid)
5716 continue;
5717
5718 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5719 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5720 cond_resched_lock(&kvm->mmu_lock);
5721 goto restart;
5722 }
5723
5724 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
5725 goto restart;
5726 }
5727
5728 kvm_mmu_commit_zap_page(kvm, &invalid_list);
5729}
5730
5731/*
5732 * Fast invalidate all shadow pages and use lock-break technique
5733 * to zap obsolete pages.
5734 *
5735 * It's required when memslot is being deleted or VM is being
5736 * destroyed, in these cases, we should ensure that KVM MMU does
5737 * not use any resource of the being-deleted slot or all slots
5738 * after calling the function.
5739 */
5740static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5741{
5742 spin_lock(&kvm->mmu_lock);
5743 kvm->arch.mmu_valid_gen++;
5744
5745 kvm_zap_obsolete_pages(kvm);
5746 spin_unlock(&kvm->mmu_lock);
5747}
5748
5652static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, 5749static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5653 struct kvm_memory_slot *slot, 5750 struct kvm_memory_slot *slot,
5654 struct kvm_page_track_notifier_node *node) 5751 struct kvm_page_track_notifier_node *node)
5655{ 5752{
5656 kvm_mmu_zap_all(kvm); 5753 kvm_mmu_zap_all_fast(kvm);
5657} 5754}
5658 5755
5659void kvm_mmu_init_vm(struct kvm *kvm) 5756void kvm_mmu_init_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ced9fba32598..a3cba321b5c5 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4540,6 +4540,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
4540 int len; 4540 int len;
4541 gva_t gva = 0; 4541 gva_t gva = 0;
4542 struct vmcs12 *vmcs12; 4542 struct vmcs12 *vmcs12;
4543 struct x86_exception e;
4543 short offset; 4544 short offset;
4544 4545
4545 if (!nested_vmx_check_permission(vcpu)) 4546 if (!nested_vmx_check_permission(vcpu))
@@ -4588,7 +4589,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
4588 vmx_instruction_info, true, len, &gva)) 4589 vmx_instruction_info, true, len, &gva))
4589 return 1; 4590 return 1;
4590 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 4591 /* _system ok, nested_vmx_check_permission has verified cpl=0 */
4591 kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL); 4592 if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4593 kvm_inject_page_fault(vcpu, &e);
4592 } 4594 }
4593 4595
4594 return nested_vmx_succeed(vcpu); 4596 return nested_vmx_succeed(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 290c3c3efb87..91602d310a3f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5312,6 +5312,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
5312 /* kvm_write_guest_virt_system can pull in tons of pages. */ 5312 /* kvm_write_guest_virt_system can pull in tons of pages. */
5313 vcpu->arch.l1tf_flush_l1d = true; 5313 vcpu->arch.l1tf_flush_l1d = true;
5314 5314
5315 /*
5316 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5317 * is returned, but our callers are not ready for that and they blindly
5318 * call kvm_inject_page_fault. Ensure that they at least do not leak
5319 * uninitialized kernel stack memory into cr2 and error code.
5320 */
5321 memset(exception, 0, sizeof(*exception));
5315 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 5322 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
5316 PFERR_WRITE_MASK, exception); 5323 PFERR_WRITE_MASK, exception);
5317} 5324}