From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 31 May 2010 14:28:19 +0800 Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages In common cases, guest SRAO MCE will cause corresponding poisoned page be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay the MCE to guest OS. But it is reported that if the poisoned page is accessed in guest after unmapping and before MCE is relayed to guest OS, userspace will be killed. The reason is as follows. Because poisoned page has been un-mapped, guest access will cause guest exit and kvm_mmu_page_fault will be called. kvm_mmu_page_fault can not get the poisoned page for fault address, so kernel and user space MMIO processing is tried in turn. In user MMIO processing, poisoned page is accessed again, then userspace is killed by force_sig_info. To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM and do not try kernel and user space MMIO processing for poisoned page. [xiao: fix warning introduced by avi] Reported-by: Max Asbock Signed-off-by: Huang Ying Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..b666d8d106a9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return pt_write; } +static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +{ + char buf[1]; + void __user *hva; + int r; + + /* Touch the page, so send SIGBUS */ + hva = (void __user *)gfn_to_hva(kvm, gfn); + r = copy_from_user(buf, hva, 1); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(kvm, gfn); + return 0; + } + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; -- cgit v1.2.2 From 54a4f0239f2e98bc0842818f611a4cf73bb7dd35 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 5 May 2010 09:03:49 +0800 Subject: KVM: MMU: make kvm_mmu_zap_page() return the number of pages it actually freed Currently, kvm_mmu_zap_page() returning the number of freed children sp. This might confuse the caller, because caller don't know the actual freed number. Let's make kvm_mmu_zap_page() return the number of pages it actually freed. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b666d8d106a9..be981b1f1881 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1504,6 +1504,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { + /* Count self */ + ret++; hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { @@ -1540,7 +1542,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); used_pages -= kvm_mmu_zap_page(kvm, page); - used_pages--; } kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; @@ -2941,7 +2942,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page) + 1; + return kvm_mmu_zap_page(kvm, page); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) -- cgit v1.2.2 From 6d77dbfc88e37c9efd5c5dd18445cfe819ae17ea Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 10 May 2010 11:16:56 +0300 Subject: KVM: inject #UD if instruction emulation fails and exit to userspace Do not kill VM when instruction emulation fails. Inject #UD and report failure to userspace instead. Userspace may choose to reenter guest if vcpu is in userspace (cpl == 3) in which case guest OS will kill offending process and continue running. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be981b1f1881..4a02dee1f2b5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2814,11 +2814,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) return 1; case EMULATE_DO_MMIO: ++vcpu->stat.mmio_exits; - return 0; + /* fall through */ case EMULATE_FAIL: - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; return 0; default: BUG(); -- cgit v1.2.2 From f0f5933a1626c8df7b0bfd227819c66320fb4f0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:09:56 +0300 Subject: KVM: MMU: Fix free memory accounting race in mmu_alloc_roots() We drop the mmu lock between freeing memory and allocating the roots; this allows some other vcpu to sneak in and allocate memory. While the race is benign (resulting only in temporary overallocation, not oom) it is simple and easy to fix by moving the freeing close to the allocation. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4a02dee1f2b5..d7aebafffdfe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,6 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2124,6 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu->kvm); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2496,9 +2498,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); -- cgit v1.2.2 From 24955b6c906045382b67f3e6beba7e5df4a4a045 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 12 May 2010 21:00:35 -0300 Subject: KVM: pass correct parameter to kvm_mmu_free_some_pages Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d7aebafffdfe..a455c5eee370 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2094,7 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); @@ -2125,7 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = i << 30; } spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu->kvm); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); -- cgit v1.2.2 From 62ad07551a2ace89e35604d1c55fdae1dd3359a8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 12 May 2010 16:40:41 +0800 Subject: KVM: x86: Clean up duplicate assignment mmu.free() already set root_hpa to INVALID_PAGE, no need to do it again in the destory_kvm_mmu(). kvm_x86_ops->set_cr4() and set_efer() already assign cr4/efer to vcpu->arch.cr4/efer, no need to do it again later. Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a455c5eee370..c075542648cd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,10 +2478,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) + /* mmu.free() should set root_hpa = INVALID_PAGE */ vcpu->arch.mmu.free(vcpu); - vcpu->arch.mmu.root_hpa = INVALID_PAGE; - } } int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From e8ad9a707496c163312bcdd6aa3b90603d45dc9b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:06:02 +0800 Subject: KVM: MMU: use proper cache object freeing function Use kmem_cache_free to free objects allocated by kmem_cache_alloc. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c075542648cd..bb48b0ca5f8c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -305,10 +305,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, return 0; } -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, + struct kmem_cache *cache) { while (mc->nobjs) - kfree(mc->objects[--mc->nobjs]); + kmem_cache_free(cache, mc->objects[--mc->nobjs]); } static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, @@ -356,10 +357,11 @@ out: static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, + mmu_page_header_cache); } static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, @@ -380,7 +382,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) static void mmu_free_pte_chain(struct kvm_pte_chain *pc) { - kfree(pc); + kmem_cache_free(pte_chain_cache, pc); } static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) @@ -391,7 +393,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) { - kfree(rd); + kmem_cache_free(rmap_desc_cache, rd); } /* @@ -898,7 +900,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) list_del(&sp->link); __free_page(virt_to_page(sp->spt)); __free_page(virt_to_page(sp->gfns)); - kfree(sp); + kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } -- cgit v1.2.2 From 6d74229f013ed8e4a00d74cfa7a3fa6a2315c467 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 13 May 2010 10:07:00 +0800 Subject: KVM: MMU: remove rmap before clear spte Remove rmap before clear spte otherwise it will trigger BUG_ON() in some functions such as rmap_write_protect(). Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb48b0ca5f8c..5c9d6df0113e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1813,6 +1813,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; + rmap_remove(vcpu->kvm, sptep); spte = shadow_trap_nonpresent_pte; goto set_pte; } -- cgit v1.2.2 From 1d9dc7e000915b9607b480e34fcb4238b789fbb1 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:51:24 +0800 Subject: KVM: MMU: split kvm_sync_page() function Split kvm_sync_page() into kvm_sync_page() and kvm_sync_page_transient() to clarify the code address Avi's suggestion kvm_sync_page_transient() function only update shadow page but not mark it sync and not write protect sp->gfn. it will be used by later patch Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5c9d6df0113e..ef5d140a2705 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1199,16 +1199,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); - kvm_unlink_unsync_page(vcpu->kvm, sp); + if (clear_unsync) { + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); + kvm_unlink_unsync_page(vcpu->kvm, sp); + } + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; @@ -1218,6 +1222,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } +static void mmu_convert_notrap(struct kvm_mmu_page *sp); +static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int ret; + + ret = __kvm_sync_page(vcpu, sp, false); + if (!ret) + mmu_convert_notrap(sp); + return ret; +} + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + return __kvm_sync_page(vcpu, sp, true); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; -- cgit v1.2.2 From e02aa901b1aa41fb541521800cc2a4774c162485 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 15 May 2010 18:52:34 +0800 Subject: KVM: MMU: don't write-protect if have new mapping to unsync page Two cases maybe happen in kvm_mmu_get_page() function: - one case is, the goal sp is already in cache, if the sp is unsync, we only need update it to assure this mapping is valid, but not mark it sync and not write-protect sp->gfn since it not broke unsync rule(one shadow page for a gfn) - another case is, the goal sp not existed, we need create a new sp for gfn, i.e, gfn (may)has another shadow page, to keep unsync rule, we should sync(mark sync and write-protect) gfn's unsync shadow page. After enabling multiple unsync shadows, we sync those shadow pages only when the new sp not allow to become unsync(also for the unsyc rule, the new rule is: allow all pte page become unsync) Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ef5d140a2705..064ddfbde108 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1337,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp; + struct kvm_mmu_page *sp, *unsync_sp = NULL; struct hlist_node *node, *tmp; role = vcpu->arch.mmu.base_role; @@ -1356,20 +1356,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { if (sp->unsync) - if (kvm_sync_page(vcpu, sp)) - continue; + unsync_sp = sp; if (sp->role.word != role.word) continue; + if (!direct && unsync_sp && + kvm_sync_page_transient(vcpu, unsync_sp)) { + unsync_sp = NULL; + break; + } + mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); kvm_mmu_mark_parents_unsync(sp); - } + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); + trace_kvm_mmu_get_page(sp, false); return sp; } + if (!direct && unsync_sp) + kvm_sync_page(vcpu, unsync_sp); + ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) -- cgit v1.2.2 From 221d059d15f1c8bd070a63fd45cd8d2598af5f99 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 23 May 2010 18:37:00 +0300 Subject: KVM: Update Red Hat copyrights Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 064ddfbde108..25d3bb2543e2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,6 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affilates. * * Authors: * Yaniv Kamay -- cgit v1.2.2 From 9cf5cf5ad43b293581e5b87678ea5783c06d1a41 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:40:07 +0800 Subject: KVM: MMU: allow more page become unsync at gfn mapping time In current code, shadow page can become asynchronous only if one shadow page for a gfn, this rule is too strict, in fact, we can let all last mapping page(i.e, it's the pte page) become unsync, and sync them at invlpg or flush tlb time. This patch allow more page become asynchronous at gfn mapping time Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 25d3bb2543e2..ba119dae890e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1170,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, return __mmu_unsync_walk(sp, pvec); } -static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) -{ - unsigned index; - struct hlist_head *bucket; - struct kvm_mmu_page *sp; - struct hlist_node *node; - - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: found role %x\n", - __func__, sp->role.word); - return sp; - } - return NULL; -} - static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); @@ -1759,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); -static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + trace_kvm_mmu_unsync_page(sp); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + + kvm_mmu_mark_parents_unsync(sp); + mmu_convert_notrap(sp); +} + +static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - unsigned index; struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; + unsigned index; - index = kvm_page_table_hashfn(sp->gfn); + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != sp->gfn || s->role.direct) + if (s->gfn != gfn || s->role.direct || s->unsync || + s->role.invalid) continue; - if (s->role.word != sp->role.word) - return 1; + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + __kvm_unsync_page(vcpu, s); } - trace_kvm_mmu_unsync_page(sp); - ++vcpu->kvm->stat.mmu_unsync; - sp->unsync = 1; - - kvm_mmu_mark_parents_unsync(sp); - - mmu_convert_notrap(sp); - return 0; } static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - struct kvm_mmu_page *shadow; + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + bool need_unsync = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || s->role.direct || s->role.invalid) + continue; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (shadow->unsync) - return 0; - if (can_unsync && oos_shadow) - return kvm_unsync_page(vcpu, shadow); - return 1; + + if (!need_unsync && !s->unsync) { + if (!can_unsync || !oos_shadow) + return 1; + need_unsync = true; + } } + if (need_unsync) + kvm_unsync_pages(vcpu, gfn); return 0; } -- cgit v1.2.2 From 9f1a122f970dbef5ba3496587f39df5c1853083f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 24 May 2010 15:41:33 +0800 Subject: KVM: MMU: allow more page become unsync at getting sp time Allow more page become asynchronous at getting sp time, if need create new shadow page for gfn but it not allow unsync(level > 1), we should unsync all gfn's unsync page Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba119dae890e..07673487fd5d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1220,6 +1220,35 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return __kvm_sync_page(vcpu, sp, true); } +/* @gfn should be write-protected at the call site */ +static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + unsigned index; + bool flush = false; + + index = kvm_page_table_hashfn(gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != gfn || !s->unsync || s->role.invalid) + continue; + + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + if ((s->role.cr4_pae != !!is_pae(vcpu)) || + (vcpu->arch.mmu.sync_page(vcpu, s))) { + kvm_mmu_zap_page(vcpu->kvm, s); + continue; + } + kvm_unlink_unsync_page(vcpu->kvm, s); + flush = true; + } + + if (flush) + kvm_mmu_flush_tlb(vcpu); +} + struct mmu_page_path { struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; unsigned int idx[PT64_ROOT_LEVEL-1]; @@ -1318,8 +1347,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned index; unsigned quadrant; struct hlist_head *bucket; - struct kvm_mmu_page *sp, *unsync_sp = NULL; + struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; + bool need_sync = false; role = vcpu->arch.mmu.base_role; role.level = level; @@ -1336,17 +1366,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) if (sp->gfn == gfn) { - if (sp->unsync) - unsync_sp = sp; + if (!need_sync && sp->unsync) + need_sync = true; if (sp->role.word != role.word) continue; - if (!direct && unsync_sp && - kvm_sync_page_transient(vcpu, unsync_sp)) { - unsync_sp = NULL; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) break; - } mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { @@ -1358,9 +1385,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, trace_kvm_mmu_get_page(sp, false); return sp; } - if (!direct && unsync_sp) - kvm_sync_page(vcpu, unsync_sp); - ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) @@ -1371,6 +1395,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); + if (level > PT_PAGE_TABLE_LEVEL && need_sync) + kvm_sync_pages(vcpu, gfn); + account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) -- cgit v1.2.2 From 2032a93d66fa282ba0f2ea9152eeff9511fa9a96 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:49:59 +0800 Subject: KVM: MMU: Don't allocate gfns page for direct mmu pages When sp->role.direct is set, sp->gfns does not contain any essential information, leaf sptes reachable from this sp are for a continuous guest physical memory range (a linear range). So sp->gfns[i] (if it was set) equals to sp->gfn + i. (PT_PAGE_TABLE_LEVEL) Obviously, it is not essential information, we can calculate it when need. It means we don't need sp->gfns when sp->role.direct=1, Thus we can save one page usage for every kvm_mmu_page. Note: Access to sp->gfns must be wrapped by kvm_mmu_page_get_gfn() or kvm_mmu_page_set_gfn(). It is only exposed in FNAME(sync_page). Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 07673487fd5d..f46b6c9aff27 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -397,6 +397,22 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) kmem_cache_free(rmap_desc_cache, rd); } +static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) +{ + if (!sp->role.direct) + return sp->gfns[index]; + + return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); +} + +static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) +{ + if (sp->role.direct) + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); + else + sp->gfns[index] = gfn; +} + /* * Return the pointer to the largepage write count for a given * gfn, handling slots that are not large page aligned. @@ -547,7 +563,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return count; gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); - sp->gfns[spte - sp->spt] = gfn; + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); @@ -605,6 +621,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; pfn_t pfn; + gfn_t gfn; unsigned long *rmapp; int i; @@ -616,7 +633,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(*spte)) kvm_set_pfn_dirty(pfn); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -900,7 +918,8 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); - __free_page(virt_to_page(sp->gfns)); + if (!sp->role.direct) + __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); ++kvm->arch.n_free_mmu_pages; } @@ -911,13 +930,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn) } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, - u64 *parent_pte) + u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + if (!direct) + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, + PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); @@ -1386,7 +1407,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } ++vcpu->kvm->stat.mmu_cache_miss; - sp = kvm_mmu_alloc_page(vcpu, parent_pte); + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; @@ -3403,7 +3424,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) if (*sptep & PT_WRITABLE_MASK) { rev_sp = page_header(__pa(sptep)); - gfn = rev_sp->gfns[sptep - rev_sp->spt]; + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { if (!printk_ratelimit()) @@ -3417,8 +3438,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - rev_sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; -- cgit v1.2.2 From c9fa0b3bef9a0b117b3c3f958ec553c21f609a9f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 26 May 2010 16:48:25 +0800 Subject: KVM: MMU: Calculate correct base gfn for direct non-DIR level In Document/kvm/mmu.txt: gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. But in __direct_map(), the base gfn calculation is incorrect, it does not calculate correctly when level=3 or 4. Fix by using PT64_LVL_ADDR_MASK() which accounts for all levels correctly. Reported-by: Marcelo Tosatti Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f46b6c9aff27..c0350be52c91 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2020,7 +2020,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, } if (*iterator.sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + u64 base_addr = iterator.addr; + + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); + pseudo_gfn = base_addr >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, iterator.level - 1, 1, ACC_ALL, iterator.sptep); -- cgit v1.2.2 From 01c168ac3d6568fed0373d82bd2db2b9339aab16 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 27 May 2010 16:09:48 +0800 Subject: KVM: MMU: don't check PT_WRITABLE_MASK directly Since we have is_writable_pte(), make use of it. Signed-off-by: Gui Jianfeng Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c0350be52c91..9f4be0114bce 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2990,7 +2990,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ - if (pt[i] & PT_WRITABLE_MASK) + if (is_writable_pte(pt[i])) pt[i] &= ~PT_WRITABLE_MASK; } kvm_flush_remote_tlbs(kvm); @@ -3425,7 +3425,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_mmu_page *rev_sp; gfn_t gfn; - if (*sptep & PT_WRITABLE_MASK) { + if (is_writable_pte(*sptep)) { rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); @@ -3474,7 +3474,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) if (!(ent & PT_PRESENT_MASK)) continue; - if (!(ent & PT_WRITABLE_MASK)) + if (!is_writable_pte(ent)) continue; inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } @@ -3508,7 +3508,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) spte = rmap_next(vcpu->kvm, rmapp, NULL); while (spte) { - if (*spte & PT_WRITABLE_MASK) + if (is_writable_pte(*spte)) printk(KERN_ERR "%s: (%s) shadow page has " "writable mappings: gfn %lx role %x\n", __func__, audit_msg, sp->gfn, -- cgit v1.2.2 From 8184dd38e22fcaec664c2b98c382b85c26780e26 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 27 May 2010 14:22:51 +0300 Subject: KVM: MMU: Allow spte.w=1 for gpte.w=0 and cr0.wp=0 only in shadow mode When tdp is enabled, the guest's cr0.wp shouldn't have any effect on spte permissions. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9f4be0114bce..69d40a6e1e68 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1882,7 +1882,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) + && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { -- cgit v1.2.2 From b66d80006e415ee083e59c9429911eab78047f8f Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 31 May 2010 17:11:39 +0800 Subject: KVM: MMU: Don't calculate quadrant if tdp_enabled There's no need to calculate quadrant if tdp is enabled. Signed-off-by: Gui Jianfeng Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 69d40a6e1e68..d3cd102aee26 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; -- cgit v1.2.2 From 03116aa57e75b1bbe8b5e04f3cd21cdb6588c4ba Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:52:17 +0800 Subject: KVM: MMU: skip invalid sp when unprotect page In kvm_mmu_unprotect_page(), the invalid sp can be skipped Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d3cd102aee26..3ac51153bc47 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1629,7 +1629,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) bucket = &kvm->arch.mmu_page_hash[index]; restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct) { + if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; -- cgit v1.2.2 From 7ae680eb2d5f0cb10ca0e6d1ff5ecb145befe8e4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:07 +0800 Subject: KVM: MMU: introduce some macros to cleanup hlist traverseing Introduce for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() to cleanup hlist traverseing Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 122 +++++++++++++++++++++-------------------------------- 1 file changed, 47 insertions(+), 75 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ac51153bc47..881ad918455c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1201,6 +1201,17 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn)) {} else + +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ + hlist_for_each_entry_safe(sp, pos, n, \ + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ + (sp)->role.invalid) {} else + static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, bool clear_unsync) { @@ -1244,16 +1255,12 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) /* @gfn should be write-protected at the call site */ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; bool flush = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || !s->unsync || s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (!s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1365,9 +1372,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, u64 *parent_pte) { union kvm_mmu_page_role role; - unsigned index; unsigned quadrant; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *tmp; bool need_sync = false; @@ -1383,36 +1388,34 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) - if (sp->gfn == gfn) { - if (!need_sync && sp->unsync) - need_sync = true; + for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + if (!need_sync && sp->unsync) + need_sync = true; - if (sp->role.word != role.word) - continue; + if (sp->role.word != role.word) + continue; - if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) - break; + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) + break; - mmu_page_add_parent_pte(vcpu, sp, parent_pte); - if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(sp); - } else if (sp->unsync) - kvm_mmu_mark_parents_unsync(sp); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(sp); + } else if (sp->unsync) + kvm_mmu_mark_parents_unsync(sp); - trace_kvm_mmu_get_page(sp, false); - return sp; - } + trace_kvm_mmu_get_page(sp, false); + return sp; + } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); if (!sp) return sp; sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, bucket); + hlist_add_head(&sp->hash_link, + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { if (rmap_write_protect(vcpu->kvm, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); @@ -1617,46 +1620,34 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *n; int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, - sp->role.word); - r = 1; - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + sp->role.word); + r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; + } return r; } static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *sp; struct hlist_node *node, *nn; - index = kvm_page_table_hashfn(gfn); - bucket = &kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { - if (sp->gfn == gfn && !sp->role.direct - && !sp->role.invalid) { - pgprintk("%s: zap %lx %x\n", - __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) - goto restart; - } + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + pgprintk("%s: zap %lx %x\n", + __func__, gfn, sp->role.word); + if (kvm_mmu_zap_page(kvm, sp)) + goto restart; } } @@ -1799,17 +1790,11 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; - unsigned index; - - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->unsync || - s->role.invalid) + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); __kvm_unsync_page(vcpu, s); @@ -1819,18 +1804,11 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { - unsigned index; - struct hlist_head *bucket; struct kvm_mmu_page *s; struct hlist_node *node, *n; bool need_unsync = false; - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { - if (s->gfn != gfn || s->role.direct || s->role.invalid) - continue; - + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2703,8 +2681,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; - struct hlist_head *bucket; - unsigned index; u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2772,13 +2748,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pte_updated = NULL; } } - index = kvm_page_table_hashfn(gfn); - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; restart: - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) - continue; + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; -- cgit v1.2.2 From 7775834a233478ec855b97e30727248f12eafe76 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:53:54 +0800 Subject: KVM: MMU: split the operations of kvm_mmu_zap_page() Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to split kvm_mmu_zap_page() function, then we can: - traverse hlist safely - easily to gather remote tlb flush which occurs during page zapped Those feature can be used in the later patches Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 881ad918455c..9b849a70742d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -916,6 +916,7 @@ static int is_empty_shadow_page(u64 *spt) static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); + hlist_del(&sp->hash_link); list_del(&sp->link); __free_page(virt_to_page(sp->spt)); if (!sp->role.direct) @@ -1200,6 +1201,10 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list); +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list); #define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ hlist_for_each_entry_safe(sp, pos, n, \ @@ -1530,7 +1535,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } static int mmu_zap_unsync_children(struct kvm *kvm, - struct kvm_mmu_page *parent) + struct kvm_mmu_page *parent, + struct list_head *invalid_list) { int i, zapped = 0; struct mmu_page_path parents; @@ -1544,7 +1550,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp; for_each_sp(pages, sp, parents, i) { - kvm_mmu_zap_page(kvm, sp); + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); mmu_pages_clear_parents(&parents); zapped++; } @@ -1554,16 +1560,16 @@ static int mmu_zap_unsync_children(struct kvm *kvm, return zapped; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { int ret; - trace_kvm_mmu_zap_page(sp); + trace_kvm_mmu_prepare_zap_page(sp); ++kvm->stat.mmu_shadow_zapped; - ret = mmu_zap_unsync_children(kvm, sp); + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); - kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); if (sp->unsync) @@ -1571,17 +1577,45 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->root_count) { /* Count self */ ret++; - hlist_del(&sp->hash_link); - kvm_mmu_free_page(kvm, sp); + list_move(&sp->link, invalid_list); } else { - sp->role.invalid = 1; list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); } + + sp->role.invalid = 1; kvm_mmu_reset_last_pte_updated(kvm); return ret; } +static void kvm_mmu_commit_zap_page(struct kvm *kvm, + struct list_head *invalid_list) +{ + struct kvm_mmu_page *sp; + + if (list_empty(invalid_list)) + return; + + kvm_flush_remote_tlbs(kvm); + + do { + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); + WARN_ON(!sp->role.invalid || sp->root_count); + kvm_mmu_free_page(kvm, sp); + } while (!list_empty(invalid_list)); + +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + LIST_HEAD(invalid_list); + int ret; + + ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); + return ret; +} + /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock -- cgit v1.2.2 From 103ad25a86a6ec5418b3dca6a0d2bf2ba01a8318 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:54:38 +0800 Subject: KVM: MMU: don't get free page number in the loop In the later patch, we will modify sp's zapping way like below: kvm_mmu_prepare_zap_page A kvm_mmu_prepare_zap_page B kvm_mmu_prepare_zap_page C .... kvm_mmu_commit_zap_page [ zaped multiple sps only need to call kvm_mmu_commit_zap_page once ] In __kvm_mmu_free_some_pages() function, the free page number is getted form 'vcpu->kvm->arch.n_free_mmu_pages' in loop, it will hinders us to apply kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() since kvm_mmu_prepare_zap_page() not free sp. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b849a70742d..1aad8e713f78 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2863,13 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + int free_pages; + + free_pages = vcpu->kvm->arch.n_free_mmu_pages; + while (free_pages < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); ++vcpu->kvm->stat.mmu_recycled; } } -- cgit v1.2.2 From d98ba053656c033180781007241f2c9d54606d56 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:55:29 +0800 Subject: KVM: MMU: gather remote tlb flush which occurs during page zapped Using kvm_mmu_prepare_zap_page() and kvm_mmu_zap_page() instead of kvm_mmu_zap_page() that can reduce remote tlb flush IPI Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 31 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1aad8e713f78..44548e346976 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1200,7 +1200,6 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) --kvm->stat.mmu_unsync; } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, struct list_head *invalid_list); static void kvm_mmu_commit_zap_page(struct kvm *kvm, @@ -1218,10 +1217,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, (sp)->role.invalid) {} else static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - bool clear_unsync) + struct list_head *invalid_list, bool clear_unsync) { if (sp->role.cr4_pae != !!is_pae(vcpu)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1232,7 +1231,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1244,17 +1243,22 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { + LIST_HEAD(invalid_list); int ret; - ret = __kvm_sync_page(vcpu, sp, false); + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); if (!ret) mmu_convert_notrap(sp); + else + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + return ret; } -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + struct list_head *invalid_list) { - return __kvm_sync_page(vcpu, sp, true); + return __kvm_sync_page(vcpu, sp, invalid_list, true); } /* @gfn should be write-protected at the call site */ @@ -1262,6 +1266,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); bool flush = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { @@ -1271,13 +1276,14 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || (vcpu->arch.mmu.sync_page(vcpu, s))) { - kvm_mmu_zap_page(vcpu->kvm, s); + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } kvm_unlink_unsync_page(vcpu->kvm, s); flush = true; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); if (flush) kvm_mmu_flush_tlb(vcpu); } @@ -1348,6 +1354,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp; struct mmu_page_path parents; struct kvm_mmu_pages pages; + LIST_HEAD(invalid_list); kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { @@ -1360,9 +1367,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_flush_remote_tlbs(vcpu->kvm); for_each_sp(pages, sp, parents, i) { - kvm_sync_page(vcpu, sp); + kvm_sync_page(vcpu, sp, &invalid_list); mmu_pages_clear_parents(&parents); } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); cond_resched_lock(&vcpu->kvm->mmu_lock); kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1606,16 +1614,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } -static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - LIST_HEAD(invalid_list); - int ret; - - ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); - kvm_mmu_commit_zap_page(kvm, &invalid_list); - return ret; -} - /* * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock @@ -1623,6 +1621,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) { int used_pages; + LIST_HEAD(invalid_list); used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; used_pages = max(0, used_pages); @@ -1640,8 +1639,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_zap_page(kvm, page); + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, + &invalid_list); } + kvm_mmu_commit_zap_page(kvm, &invalid_list); kvm_nr_mmu_pages = used_pages; kvm->arch.n_free_mmu_pages = 0; } @@ -1656,6 +1657,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); @@ -1665,9 +1667,10 @@ restart: pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; } @@ -1675,14 +1678,16 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; struct hlist_node *node, *nn; + LIST_HEAD(invalid_list); restart: for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; } + kvm_mmu_commit_zap_page(kvm, &invalid_list); } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) @@ -2123,6 +2128,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + LIST_HEAD(invalid_list); if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; @@ -2132,8 +2138,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; - if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + if (!sp->root_count && sp->role.invalid) { + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + } vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -2146,10 +2154,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) - kvm_mmu_zap_page(vcpu->kvm, sp); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = INVALID_PAGE; } @@ -2715,6 +2725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; struct hlist_node *node, *n; + LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); @@ -2801,7 +2812,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_zap_page(vcpu->kvm, sp)) + if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list)) goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2836,6 +2848,7 @@ restart: ++spte; } } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { @@ -2864,6 +2877,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { int free_pages; + LIST_HEAD(invalid_list); free_pages = vcpu->kvm->arch.n_free_mmu_pages; while (free_pages < KVM_REFILL_PAGES && @@ -2872,9 +2886,11 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_zap_page(vcpu->kvm, sp); + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3009,25 +3025,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) void kvm_mmu_zap_all(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; + LIST_HEAD(invalid_list); spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_zap_page(kvm, sp)) + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) goto restart; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, + struct list_head *invalid_list) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - return kvm_mmu_zap_page(kvm, page); + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) @@ -3040,6 +3059,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages, idx, freed_pages; + LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -3047,12 +3067,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, + &invalid_list); cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; + kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); } -- cgit v1.2.2 From f41d335a02d5132c14ec0459d3b2790eeb16fb11 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:11 +0800 Subject: KVM: MMU: traverse sp hlish safely Now, we can safely to traverse sp hlish Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 51 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 28 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 44548e346976..3b75689eda95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1205,13 +1205,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n) \ - hlist_for_each_entry_safe(sp, pos, n, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ + hlist_for_each_entry(sp, pos, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1265,11 +1265,11 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (!s->unsync) continue; @@ -1387,7 +1387,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node, *tmp; + struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1401,7 +1401,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) { + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { if (!need_sync && sp->unsync) need_sync = true; @@ -1656,19 +1656,18 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) { + + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); return r; @@ -1677,15 +1676,13 @@ restart: static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node, *nn; + struct hlist_node *node; LIST_HEAD(invalid_list); -restart: - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } kvm_mmu_commit_zap_page(kvm, &invalid_list); } @@ -1830,9 +1827,9 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -1844,10 +1841,10 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node, *n; + struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; @@ -2724,7 +2721,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; - struct hlist_node *node, *n; + struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry; u64 *spte; @@ -2794,8 +2791,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } -restart: - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; @@ -2812,9 +2808,8 @@ restart: */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list)) - goto restart; + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; } -- cgit v1.2.2 From 0671a8e75d8aeb33e15c5152147abb0d2fa0c1e6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Jun 2010 21:56:59 +0800 Subject: KVM: MMU: reduce remote tlb flush in kvm_mmu_pte_write() collect remote tlb flush in kvm_mmu_pte_write() path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b75689eda95..b285449e82b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2666,11 +2666,15 @@ static bool need_remote_flush(u64 old, u64 new) return (old & ~new & PT64_PERM_MASK) != 0; } -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) +static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, + bool remote_flush, bool local_flush) { - if (need_remote_flush(old, new)) + if (zap_page) + return; + + if (remote_flush) kvm_flush_remote_tlbs(vcpu->kvm); - else + else if (local_flush) kvm_mmu_flush_tlb(vcpu); } @@ -2735,6 +2739,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int npte; int r; int invlpg_counter; + bool remote_flush, local_flush, zap_page; + + zap_page = remote_flush = local_flush = false; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -2808,7 +2815,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_prepare_zap_page(vcpu->kvm, sp, + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; continue; @@ -2833,16 +2840,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (quadrant != sp->role.quadrant) continue; } + local_flush = true; spte = &sp->spt[page_offset / sizeof(*spte)]; while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); if (gentry) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - mmu_pte_write_flush_tlb(vcpu, entry, *spte); + if (!remote_flush && need_remote_flush(entry, *spte)) + remote_flush = true; ++spte; } } + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); -- cgit v1.2.2 From 4f78fd08e91c52f097d64a42d903b76fe52a3a0f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:05 +0800 Subject: KVM: MMU: remove unnecessary remote tlb flush This remote tlb flush is no necessary since we have synced while sp is zapped Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b285449e82b0..098a0b8616b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3040,8 +3040,6 @@ restart: kvm_mmu_commit_zap_page(kvm, &invalid_list); spin_unlock(&kvm->mmu_lock); - - kvm_flush_remote_tlbs(kvm); } static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, -- cgit v1.2.2 From 5304efde6ae27deeeae79b97af709d4ceecc336e Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:05:57 +0800 Subject: KVM: MMU: use wrapper function to flush local tlb Use kvm_mmu_flush_tlb() function instead of calling kvm_x86_ops->tlb_flush(vcpu) directly. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 098a0b8616b0..e087f855461d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1984,7 +1984,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, reset_host_protection)) { if (write_fault) *ptwrite = 1; - kvm_x86_ops->tlb_flush(vcpu); + kvm_mmu_flush_tlb(vcpu); } pgprintk("%s: setting spte %llx\n", __func__, *sptep); -- cgit v1.2.2 From 3b5d13218667b3ca52efa52cec1d322163bf5465 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 8 Jun 2010 20:07:01 +0800 Subject: KVM: MMU: delay local tlb flush delay local tlb flush until enter guest moden, it can reduce vpid flush frequency and reduce remote tlb flush IPI(if KVM_REQ_TLB_FLUSH bit is already set, IPI is not sent) Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e087f855461d..4706a936e36f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2364,7 +2364,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); } static void paging_new_cr3(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 2390218b6aa2eb3784b0a82fa811c19097dc793a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jun 2010 17:02:16 +0300 Subject: KVM: Fix mov cr3 #GP at wrong instruction On Intel, we call skip_emulated_instruction() even if we injected a #GP, resulting in the #GP pointing at the wrong address. Fix by injecting the exception and skipping the instruction at the same place, so we can do just one or the other. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 4706a936e36f..aa98fca03ed7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3203,7 +3203,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } -- cgit v1.2.2 From f918b443527e98476c8cc45683152106b9e4bedc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:30:36 +0800 Subject: KVM: MMU: avoid double write protected in sync page path The sync page is already write protected in mmu_sync_children(), don't write protected it again Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa98fca03ed7..ff333572be75 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1216,6 +1216,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else +/* @sp->gfn should be write-protected at the call site */ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, struct list_head *invalid_list, bool clear_unsync) { @@ -1224,11 +1225,8 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 1; } - if (clear_unsync) { - if (rmap_write_protect(vcpu->kvm, sp->gfn)) - kvm_flush_remote_tlbs(vcpu->kvm); + if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - } if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); -- cgit v1.2.2 From be71e061d15c0aad4f8c2606f76c57b8a19792fd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:31:38 +0800 Subject: KVM: MMU: don't mark pte notrap if it's just sync transient If the sync-sp just sync transient, don't mark its pte notrap Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff333572be75..d1e09f3c5614 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1103,7 +1103,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) + struct kvm_mmu_page *sp, bool clear_unsync) { return 1; } @@ -1228,7 +1228,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1237,7 +1237,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return 0; } -static void mmu_convert_notrap(struct kvm_mmu_page *sp); static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -1245,9 +1244,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, int ret; ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); - if (!ret) - mmu_convert_notrap(sp); - else + if (ret) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); return ret; @@ -1273,7 +1270,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s))) { + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } -- cgit v1.2.2 From ebdea638df04ae6293a9a5414d98ad843c69e82f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:32:34 +0800 Subject: KVM: MMU: cleanup for __mmu_unsync_walk() Decrease sp->unsync_children after clear unsync_child_bitmap bit Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d1e09f3c5614..41e801b53064 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1160,9 +1160,11 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, return -ENOSPC; ret = __mmu_unsync_walk(child, pvec); - if (!ret) + if (!ret) { __clear_bit(i, sp->unsync_child_bitmap); - else if (ret > 0) + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); + } else if (ret > 0) nr_unsync_leaf += ret; else return ret; @@ -1176,8 +1178,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, } } - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) - sp->unsync_children = 0; return nr_unsync_leaf; } -- cgit v1.2.2 From 7a8f1a74e4193d21e55b35928197486f2c047efb Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:34:04 +0800 Subject: KVM: MMU: clear unsync_child_bitmap completely In current code, some page's unsync_child_bitmap is not cleared completely in mmu_sync_children(), for example, if two PDPEs shard one PDT, one of PDPE's unsync_child_bitmap is not cleared. Currently, it not harm anything just little overload, but it's the prepare work for the later patch Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 41e801b53064..ab12be4eb105 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1149,33 +1149,38 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { + struct kvm_mmu_page *child; u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - - if (child->unsync_children) { - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - - ret = __mmu_unsync_walk(child, pvec); - if (!ret) { - __clear_bit(i, sp->unsync_child_bitmap); - sp->unsync_children--; - WARN_ON((int)sp->unsync_children < 0); - } else if (ret > 0) - nr_unsync_leaf += ret; - else - return ret; - } + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) + goto clear_child_bitmap; + + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + goto clear_child_bitmap; + else if (ret > 0) + nr_unsync_leaf += ret; + else + return ret; + } else if (child->unsync) { + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + } else + goto clear_child_bitmap; - if (child->unsync) { - nr_unsync_leaf++; - if (mmu_pages_add(pvec, child, i)) - return -ENOSPC; - } - } + continue; + +clear_child_bitmap: + __clear_bit(i, sp->unsync_child_bitmap); + sp->unsync_children--; + WARN_ON((int)sp->unsync_children < 0); } -- cgit v1.2.2 From 1047df1fb682a41eb9885d6b3f2d04d6c8fd3756 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 11 Jun 2010 21:35:15 +0800 Subject: KVM: MMU: don't walk every parent pages while mark unsync While we mark the parent's unsync_child_bitmap, if the parent is already unsynced, it no need walk it's parent, it can reduce some unnecessary workload Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 61 +++++++++++++++--------------------------------------- 1 file changed, 17 insertions(+), 44 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ab12be4eb105..8c2f580956d9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -175,7 +175,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) -typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); +typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -1024,7 +1024,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } - static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; @@ -1034,63 +1033,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + fn(parent_sp, sp->parent_pte); return; } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) + u64 *spte = pte_chain->parent_ptes[i]; + + if (!spte) break; - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(parent_sp); - mmu_parent_walk(parent_sp, fn); + parent_sp = page_header(__pa(spte)); + fn(parent_sp, spte); } } -static void kvm_mmu_update_unsync_bitmap(u64 *spte) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - unsigned int index; - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - index = spte - sp->spt; - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) - sp->unsync_children++; - WARN_ON(!sp->unsync_children); + mmu_parent_walk(sp, mark_unsync); } -static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) { - struct kvm_pte_chain *pte_chain; - struct hlist_node *node; - int i; + unsigned int index; - if (!sp->parent_pte) + index = spte - sp->spt; + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; - - if (!sp->multimapped) { - kvm_mmu_update_unsync_bitmap(sp->parent_pte); + if (sp->unsync_children++) return; - } - - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { - if (!pte_chain->parent_ptes[i]) - break; - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); - } -} - -static int unsync_walk_fn(struct kvm_mmu_page *sp) -{ - kvm_mmu_update_parents_unsync(sp); - return 1; -} - -static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) -{ - mmu_parent_walk(sp, unsync_walk_fn); - kvm_mmu_update_parents_unsync(sp); + kvm_mmu_mark_parents_unsync(sp); } static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, -- cgit v1.2.2 From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 21 Jun 2010 11:44:20 +0300 Subject: KVM: Remove memory alias support As advertised in feature-removal-schedule.txt. Equivalent support is provided by overlapping memory regions. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8c2f580956d9..c5501bc10106 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int *write_count; int i; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { write_count = slot_largepage_idx(gfn, slot, i); @@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm, struct kvm_memory_slot *slot; int *largepage_idx; - gfn = unalias_gfn(kvm, gfn); - slot = gfn_to_memslot_unaliased(kvm, gfn); + slot = gfn_to_memslot(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot, level); return *largepage_idx; @@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) /* * Take gfn and return the reverse mapping to it. - * Note: gfn must be unaliased before this function get called */ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) @@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) if (!is_rmap_spte(*spte)) return count; - gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); @@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) u64 *spte; int i, write_protected = 0; - gfn = unalias_gfn(kvm, gfn); rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); spte = rmap_next(kvm, rmapp, NULL); @@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - gfn = unalias_gfn(vcpu->kvm, gfn); rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); @@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->unsync) continue; - gfn = unalias_gfn(vcpu->kvm, sp->gfn); - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; spte = rmap_next(vcpu->kvm, rmapp, NULL); -- cgit v1.2.2 From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 10 May 2010 12:34:53 +0300 Subject: KVM: Add mini-API for vcpu->requests Makes it a little more readable and hackable. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c5501bc10106..690a7fc58c17 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_mmu_mark_parents_unsync(sp); } else if (sp->unsync) kvm_mmu_mark_parents_unsync(sp); @@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) int ret = 0; if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } static void paging_new_cr3(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 36a2e6774bfb5f32a0f23bb155f1f960321f291b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 30 Jun 2010 16:02:02 +0800 Subject: KVM: MMU: fix writable sync sp mapping While we sync many unsync sp at one time(in mmu_sync_children()), we may mapping the spte writable, it's dangerous, if one unsync sp's mapping gfn is another unsync page's gfn. For example: SP1.pte[0] = P SP2.gfn's pfn = P [SP1.pte[0] = SP2.gfn's pfn] First, we write protected SP1 and SP2, but SP1 and SP2 are still the unsync sp. Then, sync SP1 first, it will detect SP1.pte[0].gfn only has one unsync-sp, that is SP2, so it will mapping it writable, but we plan to sync SP2 soon, at this point, the SP2->unsync is not reliable since later we sync SP2 but SP2->gfn is already writable. So the final result is: SP2 is the sync page but SP2.gfn is writable. This bug will corrupt guest's page table, fixed by mark read-only mapping if the mapped gfn has shadow pages. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 690a7fc58c17..ca07ed083b59 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1810,11 +1810,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool need_unsync = false; for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + if (!can_unsync) + return 1; + if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; if (!need_unsync && !s->unsync) { - if (!can_unsync || !oos_shadow) + if (!oos_shadow) return 1; need_unsync = true; } -- cgit v1.2.2 From 828554136bbacae6e39fc31b9cd7e7c660ad7530 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 1 Jul 2010 16:00:11 +0200 Subject: KVM: Remove unnecessary divide operations This patch converts unnecessary divide and modulo operations in the KVM large page related code into logical operations. This allows to convert gfn_t to u64 while not breaking 32 bit builds. Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ca07ed083b59..a20fd613acfe 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -423,8 +423,8 @@ static int *slot_largepage_idx(gfn_t gfn, { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].write_count; } @@ -528,8 +528,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); return &slot->lpage_info[level - 2][idx].rmap_pde; } -- cgit v1.2.2 From dd180b3e90253cb4ca95d603a8c17413f8daec69 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Sat, 3 Jul 2010 16:02:42 +0800 Subject: KVM: VMX: fix tlb flush with invalid root Commit 341d9b535b6c simplify reload logic while entry guest mode, it can avoid unnecessary sync-root if KVM_REQ_MMU_RELOAD and KVM_REQ_MMU_SYNC both set. But, it cause a issue that when we handle 'KVM_REQ_TLB_FLUSH', the root is invalid, it is triggered during my test: Kernel BUG at ffffffffa00212b8 [verbose debug info unavailable] ...... Fixed by directly return if the root is not ready. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a20fd613acfe..70cdf6876b5f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -92,8 +92,6 @@ module_param(oos_shadow, bool, 0644); #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define VALID_PAGE(x) ((x) != INVALID_PAGE) - #define PT64_LEVEL_BITS 9 #define PT64_LEVEL_SHIFT(level) \ -- cgit v1.2.2 From be38d276b0189fa86231fc311428622a1981ad62 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:31:27 +0300 Subject: KVM: MMU: Introduce drop_spte() When we call rmap_remove(), we (almost) always immediately follow it by an __set_spte() to a nonpresent pte. Since we need to perform the two operations atomically, to avoid losing the dirty and accessed bits, introduce a helper drop_spte() and convert all call sites. The operation is still nonatomic at this point. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 70cdf6876b5f..1ad39cf70e18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + rmap_remove(kvm, sptep); + __set_spte(sptep, new_spte); +} + static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; @@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); if (is_writable_pte(*spte)) { - rmap_remove(kvm, spte); + drop_spte(kvm, spte, + shadow_trap_nonpresent_pte); --kvm->stat.lpages; - __set_spte(spte, shadow_trap_nonpresent_pte); spte = NULL; write_protected = 1; } @@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, while ((spte = rmap_next(kvm, rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); need_tlb_flush = 1; } return need_tlb_flush; @@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { - rmap_remove(kvm, spte); - __set_spte(spte, shadow_trap_nonpresent_pte); + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); spte = rmap_next(kvm, rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); @@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } else { if (is_large_pte(ent)) --kvm->stat.lpages; - rmap_remove(kvm, &pt[i]); + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); } } pt[i] = shadow_trap_nonpresent_pte; @@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; - rmap_remove(vcpu->kvm, sptep); - spte = shadow_trap_nonpresent_pte; - goto set_pte; + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + goto done; } spte |= PT_WRITABLE_MASK; @@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: __set_spte(sptep, spte); +done: return ret; } @@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); - rmap_remove(vcpu->kvm, sptep); - __set_spte(sptep, shadow_trap_nonpresent_pte); + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else was_rmapped = 1; @@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) - rmap_remove(vcpu->kvm, spte); + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, spte); -- cgit v1.2.2 From ce061867aa2877605cda96fa8ec7dff15f70a983 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:38:12 +0300 Subject: KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to drop_spte() Since we need to make the check atomic, move it to the place that will set the new spte. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ad39cf70e18..fbdca08b8d8c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - pfn_t pfn; gfn_t gfn; unsigned long *rmapp; int i; - if (!is_rmap_spte(*spte)) - return; sp = page_header(__pa(spte)); - pfn = spte_to_pfn(*spte); - if (*spte & shadow_accessed_mask) - kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(pfn); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { @@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { + pfn_t pfn; + + if (!is_rmap_spte(*sptep)) { + __set_spte(sptep, new_spte); + return; + } + pfn = spte_to_pfn(*sptep); + if (*sptep & shadow_accessed_mask) + kvm_set_pfn_accessed(pfn); + if (is_writable_pte(*sptep)) + kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); __set_spte(sptep, new_spte); } -- cgit v1.2.2 From a9221dd5ec125fbec1702fae016c6d2ea1a9a3da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 14:48:06 +0300 Subject: KVM: MMU: Atomically check for accessed bit when dropping an spte Currently, in the window between the check for the accessed bit, and actually dropping the spte, a vcpu can access the page through the spte and set the bit, which will be ignored by the mmu. Fix by using an exchange operation to atmoically fetch the spte and drop it. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbdca08b8d8c..ba2efcf2b86e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte) #endif } +static u64 __xchg_spte(u64 *sptep, u64 new_spte) +{ +#ifdef CONFIG_X86_64 + return xchg(sptep, new_spte); +#else + u64 old_spte; + + do { + old_spte = *sptep; + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); + + return old_spte; +#endif +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { pfn_t pfn; + u64 old_spte; - if (!is_rmap_spte(*sptep)) { - __set_spte(sptep, new_spte); + old_spte = __xchg_spte(sptep, new_spte); + if (!is_rmap_spte(old_spte)) return; - } - pfn = spte_to_pfn(*sptep); - if (*sptep & shadow_accessed_mask) + pfn = spte_to_pfn(old_spte); + if (old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(*sptep)) + if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); rmap_remove(kvm, sptep); - __set_spte(sptep, new_spte); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) -- cgit v1.2.2 From b79b93f92cb3b66b89d75525fdfd2454b1e1f446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jun 2010 15:46:44 +0300 Subject: KVM: MMU: Don't drop accessed bit while updating an spte __set_spte() will happily replace an spte with the accessed bit set with one that has the accessed bit clear. Add a helper update_spte() which checks for this condition and updates the page flag if needed. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ba2efcf2b86e..d8d48329cb82 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static void update_spte(u64 *sptep, u64 new_spte) +{ + u64 old_spte; + + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + __set_spte(sptep, new_spte); + } else { + old_spte = __xchg_spte(sptep, new_spte); + if (old_spte & shadow_accessed_mask) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + } +} + static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, struct kmem_cache *base_cache, int min) { @@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); if (is_writable_pte(*spte)) { - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); + update_spte(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } spte = rmap_next(kvm, rmapp, spte); @@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte; + u64 *spte, new_spte, old_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; + new_spte &= ~shadow_accessed_mask; if (is_writable_pte(*spte)) kvm_set_pfn_dirty(spte_to_pfn(*spte)); - __set_spte(spte, new_spte); + old_spte = __xchg_spte(spte, new_spte); + if (is_shadow_present_pte(old_spte) + && (old_spte & shadow_accessed_mask)) + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } } @@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - __set_spte(sptep, spte); + update_spte(sptep, spte); done: return ret; } -- cgit v1.2.2 From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 7 Jul 2010 20:16:45 +0300 Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area Currently if guest access address that belongs to memory slot but is not backed up by page or page is read only KVM treats it like MMIO access. Remove that capability. It was never part of the interface and should not be relied upon. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d8d48329cb82..89d7a2cae53b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) if (is_hwpoison_pfn(pfn)) { kvm_send_hwpoison_signal(kvm, gfn); return 0; - } + } else if (is_fault_pfn(pfn)) + return -EFAULT; + return 1; } -- cgit v1.2.2 From 32ef26a3598636be520abed90ed0c2f439d36bbe Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:04 +0300 Subject: KVM: MMU: Add link_shadow_page() helper To simplify the process of fetching an spte, add a helper that links a shadow page to an spte. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 89d7a2cae53b..df3a7a79cce3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1482,6 +1482,16 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator->level; } +static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) +{ + u64 spte; + + spte = __pa(sp->spt) + | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.2 From 121eee97a7802acda8b78436cc53196e9885549f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:05 +0300 Subject: KVM: MMU: Use __set_spte to link shadow pages To avoid split accesses to 64 bit sptes on i386, use __set_spte() to link shadow pages together. (not technically required since shadow pages are __GFP_KERNEL, so upper 32 bits are always clear) Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index df3a7a79cce3..5a6019a534a3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1489,7 +1489,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; + __set_spte(sptep, spte); } static void kvm_mmu_page_unlink_children(struct kvm *kvm, -- cgit v1.2.2 From a3aa51cfaafe9179add88db20506ccb07e030b47 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:06 +0300 Subject: KVM: MMU: Add drop_large_spte() helper To clarify spte fetching code, move large spte handling into a helper. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5a6019a534a3..b75d6cb44ab6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1492,6 +1492,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) __set_spte(sptep, spte); } +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.2 From a357bd229cdaf37a41798d238ab50b34c71dd0d6 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 Jul 2010 14:27:07 +0300 Subject: KVM: MMU: Add validate_direct_spte() helper Add a helper to verify that a direct shadow page is valid wrt the required access permissions; drop the page if it is not valid. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b75d6cb44ab6..36c62f33513f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1500,6 +1500,29 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) } } +static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, + unsigned direct_access) +{ + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + return; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.2 From 6e3e243c3b6e0bbd18c6ce0fbc12bc3fe2d77b34 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 16 Jul 2010 11:52:55 +0200 Subject: KVM: MMU: fix mmu notifier invalidate handler for huge spte The index wasn't calculated correctly (off by one) for huge spte so KVM guest was unstable with transparent hugepages. Signed-off-by: Andrea Arcangeli Reviewed-by: Reviewed-by: Rik van Riel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 36c62f33513f..812770cddc8d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -850,8 +850,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int sh; + + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); + idx = ((memslot->base_gfn+gfn_offset) >> sh) - + (memslot->base_gfn >> sh); ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); -- cgit v1.2.2 From fa1de2bfc0feb7245328ad25fb3e6d5cd2c903b4 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:19:51 +0800 Subject: KVM: MMU: add missing reserved bits check in speculative path In the speculative path, we should check guest pte's reserved bits just as the real processor does Reported-by: Marcelo Tosatti Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 812770cddc8d..d2ea9cabc066 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2697,6 +2697,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + return; + ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -2775,6 +2778,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; + union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); @@ -2849,6 +2853,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } } + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); @@ -2896,7 +2901,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (gentry) + if (gentry && + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) + & mask.word)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; -- cgit v1.2.2 From daa3db693ce925a14b7e17ab6f306dc0e6a5342c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:23:04 +0800 Subject: KVM: MMU: fix broken page accessed tracking with ept enabled In current code, if ept is enabled(shadow_accessed_mask = 0), the page accessed tracking is lost. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d2ea9cabc066..9b3b916ebeae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -687,7 +687,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); - if (old_spte & shadow_accessed_mask) + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); @@ -815,7 +815,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, kvm_set_pfn_dirty(spte_to_pfn(*spte)); old_spte = __xchg_spte(spte, new_spte); if (is_shadow_present_pte(old_spte) - && (old_spte & shadow_accessed_mask)) + && (!shadow_accessed_mask || + old_spte & shadow_accessed_mask)) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); spte = rmap_next(kvm, rmapp, spte); } -- cgit v1.2.2 From 9ed5520dd3c9cb79c25f95fce9c57b87637d0fb7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:25:17 +0800 Subject: KVM: MMU: fix page dirty tracking lost while sync page In sync-page path, if spte.writable is changed, it will lose page dirty tracking, for example: assume spte.writable = 0 in a unsync-page, when it's synced, it map spte to writable(that is spte.writable = 1), later guest write spte.gfn, it means spte.gfn is dirty, then guest changed this mapping to read-only, after it's synced, spte.writable = 0 So, when host release the spte, it detect spte.writable = 0 and not mark page dirty Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9b3b916ebeae..a04756a26fe2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1985,6 +1985,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) + kvm_set_pfn_dirty(pfn); update_spte(sptep, spte); done: return ret; @@ -1998,7 +2000,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, bool reset_host_protection) { int was_rmapped = 0; - int was_writable = is_writable_pte(*sptep); int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" @@ -2048,15 +2049,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); - kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) rmap_recycle(vcpu, sptep, gfn); - } else { - if (was_writable) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); } + kvm_release_pfn_clean(pfn); if (speculative) { vcpu->arch.last_pte_updated = sptep; vcpu->arch.last_pte_gfn = gfn; -- cgit v1.2.2 From be233d49ea8c1fde9f4afec378dc2c2f16ab0263 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:27:10 +0800 Subject: KVM: MMU: don't atomicly set spte if it's not present If the old mapping is not present, the spte.a is not lost, so no need atomic operation to set it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a04756a26fe2..9c7fae08291d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -307,9 +307,10 @@ static void update_spte(u64 *sptep, u64 new_spte) { u64 old_spte; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) { + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || + !is_rmap_spte(*sptep)) __set_spte(sptep, new_spte); - } else { + else { old_spte = __xchg_spte(sptep, new_spte); if (old_spte & shadow_accessed_mask) mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); -- cgit v1.2.2 From e4b502ead259fcf70839414abb7c8cdc3b523f01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:28:09 +0800 Subject: KVM: MMU: cleanup spte set and accssed/dirty tracking Introduce set_spte_track_bits() to cleanup current code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c7fae08291d..e4b862eb8885 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -679,7 +679,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte; @@ -692,6 +692,11 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) kvm_set_pfn_accessed(pfn); if (is_writable_pte(old_spte)) kvm_set_pfn_dirty(pfn); +} + +static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) +{ + set_spte_track_bits(sptep, new_spte); rmap_remove(kvm, sptep); } @@ -791,7 +796,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { int need_flush = 0; - u64 *spte, new_spte, old_spte; + u64 *spte, new_spte; pte_t *ptep = (pte_t *)data; pfn_t new_pfn; @@ -812,13 +817,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~PT_WRITABLE_MASK; new_spte &= ~SPTE_HOST_WRITEABLE; new_spte &= ~shadow_accessed_mask; - if (is_writable_pte(*spte)) - kvm_set_pfn_dirty(spte_to_pfn(*spte)); - old_spte = __xchg_spte(spte, new_spte); - if (is_shadow_present_pte(old_spte) - && (!shadow_accessed_mask || - old_spte & shadow_accessed_mask)) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); + set_spte_track_bits(spte, new_spte); spte = rmap_next(kvm, rmapp, spte); } } -- cgit v1.2.2 From 9a3aad70572c3f4d55e7f09ac4eb313d41d0a484 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 16 Jul 2010 11:30:18 +0800 Subject: KVM: MMU: using __xchg_spte more smarter Sometimes, atomically set spte is not needed, this patch call __xchg_spte() more smartly Note: if the old mapping's access bit is already set, we no need atomic operation since the access bit is not lost Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e4b862eb8885..0dcc95e09876 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -682,9 +682,14 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static void set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; - u64 old_spte; + u64 old_spte = *sptep; + + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || + old_spte & shadow_accessed_mask) { + __set_spte(sptep, new_spte); + } else + old_spte = __xchg_spte(sptep, new_spte); - old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; pfn = spte_to_pfn(old_spte); -- cgit v1.2.2