author     Avi Kivity <avi@qumranet.com>	2007-09-16 12:58:32 -0400
committer  Avi Kivity <avi@qumranet.com>	2008-01-30 10:52:48 -0500
commit     c7addb902054195b995114df154e061c7d604f69 (patch)
tree       985910a6c970957126c91e55c55b0e73ae877e0c /drivers/kvm/paging_tmpl.h
parent     51c6cf662b4b361a09fbd324f4c67875d9bcfbea (diff)
KVM: Allow not-present guest page faults to bypass kvm
There are two classes of page faults trapped by kvm:

 - host page faults, where the fault is needed to allow kvm to install
   the shadow pte or update the guest accessed and dirty bits

 - guest page faults, where the guest has faulted and kvm simply injects
   the fault back into the guest to handle

The second class, guest page faults, is pure overhead.  We can eliminate
some of it on vmx using the following evil trick:

 - when we set up a shadow page table entry, if the corresponding guest pte
   is not present, set up the shadow pte as not present

 - if the guest pte _is_ present, mark the shadow pte as present but also
   set one of the reserved bits in the shadow pte

 - tell the vmx hardware not to trap faults which have the present bit clear

With this, normal page-not-present faults go directly to the guest,
bypassing kvm entirely.

Unfortunately, this trick only works on Intel hardware, as AMD lacks a way
to discriminate among page faults based on error code.  It is also a little
risky since it uses reserved bits which might become unreserved in the
future, so a module parameter is provided to disable it.

Signed-off-by: Avi Kivity <avi@qumranet.com>
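For orientation before the diff, here is a minimal sketch of how the two
non-present encodings could fit together.  It is illustrative only, not part
of the patch: the real definitions live outside this file, the setup helper
name and all values here are assumptions, and only the identifiers
shadow_trap_nonpresent_pte, shadow_notrap_nonpresent_pte and
is_shadow_present_pte() are taken from the diff below.

#include <linux/types.h>	/* u64 */

/* Two encodings for a shadow pte that has no mapping yet:
 *   shadow_trap_nonpresent_pte   - guest pte is present; kvm still has to
 *                                  build the mapping, so the fault must
 *                                  exit to kvm.
 *   shadow_notrap_nonpresent_pte - guest pte is not present; the fault is
 *                                  the guest's problem and can be delivered
 *                                  to it directly.
 */
static u64 shadow_trap_nonpresent_pte;
static u64 shadow_notrap_nonpresent_pte;

/* Assumed setup hook (hypothetical name).  With the bypass enabled on VMX,
 * the "trap" value could be a pte with the present bit plus reserved bits
 * set (the reserved-bit fault carries P=1 in its error code and still
 * causes a VM exit), while the "notrap" value is plain 0 (P=0 in the error
 * code, which the hardware is told not to trap).  Without the bypass both
 * values stay 0 and every page fault exits as before.
 */
static void example_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}

/* A shadow pte now counts as present only if it is neither special
 * encoding; this is why FNAME(fetch) below switches from
 * is_present_pte()/is_io_pte() to is_shadow_present_pte().
 */
static int is_shadow_present_pte(u64 pte)
{
	return pte != shadow_trap_nonpresent_pte
	    && pte != shadow_notrap_nonpresent_pte;
}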
Diffstat (limited to 'drivers/kvm/paging_tmpl.h')
-rw-r--r--  drivers/kvm/paging_tmpl.h | 52
1 file changed, 39 insertions(+), 13 deletions(-)
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 6b094b44f8fb..99ac9b15f773 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -31,6 +31,7 @@
 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+#define PT_LEVEL_BITS PT64_LEVEL_BITS
 #ifdef CONFIG_X86_64
 #define PT_MAX_FULL_LEVELS 4
 #else
@@ -45,6 +46,7 @@
 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+#define PT_LEVEL_BITS PT32_LEVEL_BITS
 #define PT_MAX_FULL_LEVELS 2
 #else
 #error Invalid PTTYPE value
@@ -211,12 +213,12 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 {
 	hpa_t paddr;
 	int dirty = gpte & PT_DIRTY_MASK;
-	u64 spte = *shadow_pte;
-	int was_rmapped = is_rmap_pte(spte);
+	u64 spte;
+	int was_rmapped = is_rmap_pte(*shadow_pte);
 
 	pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
 		 " user_fault %d gfn %lx\n",
-		 __FUNCTION__, spte, (u64)gpte, access_bits,
+		 __FUNCTION__, *shadow_pte, (u64)gpte, access_bits,
 		 write_fault, user_fault, gfn);
 
 	if (write_fault && !dirty) {
@@ -236,7 +238,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 		FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
 	}
 
-	spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
+	spte = PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
 	spte |= gpte & PT64_NX_MASK;
 	if (!dirty)
 		access_bits &= ~PT_WRITABLE_MASK;
@@ -248,10 +250,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 		spte |= PT_USER_MASK;
 
 	if (is_error_hpa(paddr)) {
-		spte |= gaddr;
-		spte |= PT_SHADOW_IO_MARK;
-		spte &= ~PT_PRESENT_MASK;
-		set_shadow_pte(shadow_pte, spte);
+		set_shadow_pte(shadow_pte,
+			       shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
 		return;
 	}
 
@@ -286,6 +286,7 @@ unshadowed:
 	if (access_bits & PT_WRITABLE_MASK)
 		mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
 
+	pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
 	set_shadow_pte(shadow_pte, spte);
 	page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
 	if (!was_rmapped)
@@ -304,14 +305,18 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-			      u64 *spte, const void *pte, int bytes)
+			      u64 *spte, const void *pte, int bytes,
+			      int offset_in_pte)
 {
 	pt_element_t gpte;
 
-	if (bytes < sizeof(pt_element_t))
-		return;
 	gpte = *(const pt_element_t *)pte;
-	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
+	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+		if (!offset_in_pte && !is_present_pte(gpte))
+			set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+		return;
+	}
+	if (bytes < sizeof(pt_element_t))
 		return;
 	pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
 	FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
@@ -368,7 +373,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		unsigned hugepage_access = 0;
 
 		shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-		if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
+		if (is_shadow_present_pte(*shadow_ent)) {
 			if (level == PT_PAGE_TABLE_LEVEL)
 				break;
 			shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
@@ -500,6 +505,26 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 	return gpa;
 }
 
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+				 struct kvm_mmu_page *sp)
+{
+	int i;
+	pt_element_t *gpt;
+
+	if (sp->role.metaphysical || PTTYPE == 32) {
+		nonpaging_prefetch_page(vcpu, sp);
+		return;
+	}
+
+	gpt = kmap_atomic(gfn_to_page(vcpu->kvm, sp->gfn), KM_USER0);
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		if (is_present_pte(gpt[i]))
+			sp->spt[i] = shadow_trap_nonpresent_pte;
+		else
+			sp->spt[i] = shadow_notrap_nonpresent_pte;
+	kunmap_atomic(gpt, KM_USER0);
+}
+
 #undef pt_element_t
 #undef guest_walker
 #undef FNAME
@@ -508,4 +533,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 #undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
 #undef PT_MAX_FULL_LEVELS