author		Linus Torvalds <torvalds@linux-foundation.org>	2019-09-14 19:07:40 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-14 19:07:40 -0400
commit		1609d7604b847a9820e63393d1a3b6cac7286d40 (patch)
tree		993914907707ceff9eb965f8c519f0a91f5ab192 /arch/x86
parent		1f9c632cde0c3d781463a88ce430a8dd4a7c1a0e (diff)
parent		a9c20bb0206ae9384bd470a6832dd8913730add9 (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm fixes from Paolo Bonzini:
"The main change here is a revert of reverts. We recently simplified
some code that was thought unnecessary; however, since then KVM has
grown quite a few cond_resched()s and for that reason the simplified
code is prone to livelocks: one CPU tries to empty a list of guest
page tables while the others keep adding to them. This adds back the
generation-based zapping of guest page tables, which was not
unnecessary after all.
On top of this, there is a fix for a kernel memory leak and a couple
of s390 fixlets as well"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
KVM: x86/mmu: Reintroduce fast invalidate/zap for flushing memslot
KVM: x86: work around leak of uninitialized stack contents
KVM: nVMX: handle page fault in vmread
KVM: s390: Do not leak kernel stack data in the KVM_S390_INTERRUPT ioctl
KVM: s390: kvm_s390_vm_start_migration: check dirty_bitmap before using it as target for memset()
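
The generation-based scheme the pull message describes boils down to a counter: bumping it declares every existing shadow page obsolete in O(1), and obsolete pages are then skipped by lookups and reclaimed lazily under lock-break, instead of being torn down synchronously while other CPUs refill the list. Below is a minimal, self-contained C sketch of that pattern; the names (mmu_ctx, shadow_page, invalidate_all) are illustrative stand-ins, not KVM's, and the real implementation is in the mmu.c hunks further down.

#include <stdbool.h>

/* Illustrative stand-ins for struct kvm_arch and struct kvm_mmu_page. */
struct mmu_ctx {
	unsigned long valid_gen;	/* current MMU generation */
};

struct shadow_page {
	unsigned long gen;		/* generation the page was created in */
};

/* Stamp a freshly allocated page with the current generation. */
static void page_init(struct mmu_ctx *mmu, struct shadow_page *sp)
{
	sp->gen = mmu->valid_gen;
}

/* A page is obsolete once the global generation has moved past it. */
static bool page_is_obsolete(struct mmu_ctx *mmu, struct shadow_page *sp)
{
	return sp->gen != mmu->valid_gen;
}

/*
 * Invalidating everything is a single increment; obsolete pages are
 * skipped by lookups and zapped lazily, so no CPU has to empty the
 * whole list in one go while others keep adding to it.
 */
static void invalidate_all(struct mmu_ctx *mmu)
{
	mmu->valid_gen++;
}

The lazy zapper can then walk the FIFO list of active pages from the tail and stop at the first non-obsolete entry, which is exactly the invariant the comments in kvm_mmu_alloc_page() and kvm_zap_obsolete_pages() below rely on.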
Diffstat (limited to 'arch/x86')
-rw-r--r--	arch/x86/include/asm/kvm_host.h	2
-rw-r--r--	arch/x86/kvm/mmu.c	101
-rw-r--r--	arch/x86/kvm/vmx/nested.c	4
-rw-r--r--	arch/x86/kvm/x86.c	7

4 files changed, 111 insertions, 3 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 74e88e5edd9c..bdc16b0aa7c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,6 +335,7 @@ struct kvm_mmu_page {
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
+	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -856,6 +857,7 @@ struct kvm_arch {
 	unsigned long n_requested_mmu_pages;
 	unsigned long n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
+	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 218b277bfda3..a63964e7cec7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2095,6 +2095,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+	/*
+	 * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
+	 * depends on valid pages being added to the head of the list.  See
+	 * comments in kvm_zap_obsolete_pages().
+	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
@@ -2244,7 +2250,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if ((_sp)->role.invalid) {    \
+		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
 		} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
@@ -2301,6 +2307,11 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			  struct list_head *invalid_list)
 {
@@ -2525,6 +2536,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
+	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
@@ -4233,6 +4245,13 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 		return false;
 
 	if (cached_root_available(vcpu, new_cr3, new_role)) {
+		/*
+		 * It is possible that the cached previous root page is
+		 * obsolete because of a change in the MMU generation
+		 * number. However, changing the generation number is
+		 * accompanied by KVM_REQ_MMU_RELOAD, which will free
+		 * the root set here and allocate a new one.
+		 */
 		kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
 		if (!skip_tlb_flush) {
 			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5649,11 +5668,89 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 	return alloc_mmu_pages(vcpu);
 }
 
+
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+	int ign;
+
+restart:
+	list_for_each_entry_safe_reverse(sp, node,
+	      &kvm->arch.active_mmu_pages, link) {
+		/*
+		 * No obsolete valid page exists before a newly created page
+		 * since active_mmu_pages is a FIFO list.
+		 */
+		if (!is_obsolete_sp(kvm, sp))
+			break;
+
+		/*
+		 * Do not repeatedly zap a root page to avoid unnecessary
+		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+		 * progress:
+		 *    vcpu 0                        vcpu 1
+		 *                         call vcpu_enter_guest():
+		 *                            1): handle KVM_REQ_MMU_RELOAD
+		 *                                and require mmu-lock to
+		 *                                load mmu
+		 * repeat:
+		 *    1): zap root page and
+		 *        send KVM_REQ_MMU_RELOAD
+		 *
+		 *    2): if (cond_resched_lock(mmu-lock))
+		 *
+		 *                            2): hold mmu-lock and load mmu
+		 *
+		 *                            3): see KVM_REQ_MMU_RELOAD bit
+		 *                                on vcpu->requests is set
+		 *                                then return 1 to call
+		 *                                vcpu_enter_guest() again.
+		 *            goto repeat;
+		 *
+		 * Since we are reversely walking the list and the invalid
+		 * list will be moved to the head, skip the invalid page
+		 * can help us to avoid the infinity list walking.
+		 */
+		if (sp->role.invalid)
+			continue;
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
+			cond_resched_lock(&kvm->mmu_lock);
+			goto restart;
+		}
+
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+}
+
+/*
+ * Fast invalidate all shadow pages and use lock-break technique
+ * to zap obsolete pages.
+ *
+ * It's required when memslot is being deleted or VM is being
+ * destroyed, in these cases, we should ensure that KVM MMU does
+ * not use any resource of the being-deleted slot or all slots
+ * after calling the function.
+ */
+static void kvm_mmu_zap_all_fast(struct kvm *kvm)
+{
+	spin_lock(&kvm->mmu_lock);
+	kvm->arch.mmu_valid_gen++;
+
+	kvm_zap_obsolete_pages(kvm);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_zap_all(kvm);
+	kvm_mmu_zap_all_fast(kvm);
 }
 
 void kvm_mmu_init_vm(struct kvm *kvm)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ced9fba32598..a3cba321b5c5 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4540,6 +4540,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	int len;
 	gva_t gva = 0;
 	struct vmcs12 *vmcs12;
+	struct x86_exception e;
 	short offset;
 
 	if (!nested_vmx_check_permission(vcpu))
@@ -4588,7 +4589,8 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 					vmx_instruction_info, true, len, &gva))
 			return 1;
 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
-		kvm_write_guest_virt_system(vcpu, gva, &field_value, len, NULL);
+		if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+			kvm_inject_page_fault(vcpu, &e);
 	}
 
 	return nested_vmx_succeed(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 290c3c3efb87..91602d310a3f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5312,6 +5312,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
 	/* kvm_write_guest_virt_system can pull in tons of pages. */
 	vcpu->arch.l1tf_flush_l1d = true;
 
+	/*
+	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
+	 * is returned, but our callers are not ready for that and they blindly
+	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
+	 * uninitialized kernel stack memory into cr2 and error code.
+	 */
+	memset(exception, 0, sizeof(*exception));
 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
 					   PFERR_WRITE_MASK, exception);
 }
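
The memset in kvm_write_guest_virt_system above guards against the classic C pitfall spelled out in the FIXME: an on-stack out-parameter that the callee may return without filling, which the caller then publishes. A contrived sketch of the failure mode follows; the names (fault_info, do_write) are hypothetical and not KVM code.

#include <stdio.h>
#include <string.h>

struct fault_info {
	unsigned long addr;		/* would become guest-visible cr2 */
	unsigned int error_code;
};

/* May fail early and return without ever writing to *fi. */
static int do_write(int fail_early, struct fault_info *fi)
{
	if (fail_early)
		return -1;		/* *fi left untouched */
	fi->addr = 0x1000;
	fi->error_code = 2;
	return 0;
}

int main(void)
{
	struct fault_info fi;		/* uninitialized stack memory */

	/*
	 * Without this memset, a failing do_write() would let the
	 * caller report whatever bytes happened to be on the stack.
	 */
	memset(&fi, 0, sizeof(fi));

	if (do_write(1, &fi))
		printf("fault at %#lx, ec=%u\n", fi.addr, fi.error_code);
	return 0;
}

Zeroing the struct in the callee, as the patch does, protects every caller at once, even though the cleaner long-term fix named in the FIXME is to stop injecting a page fault blindly.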
