author     Avi Kivity <avi@qumranet.com>  2007-09-16 12:58:32 -0400
committer  Avi Kivity <avi@qumranet.com>  2008-01-30 10:52:48 -0500
commit     c7addb902054195b995114df154e061c7d604f69 (patch)
tree       985910a6c970957126c91e55c55b0e73ae877e0c /drivers/kvm/paging_tmpl.h
parent     51c6cf662b4b361a09fbd324f4c67875d9bcfbea (diff)
KVM: Allow not-present guest page faults to bypass kvm
There are two classes of page faults trapped by kvm:
- host page faults, where the fault is needed to allow kvm to install
the shadow pte or update the guest accessed and dirty bits
- guest page faults, where the guest has faulted and kvm simply injects
the fault back into the guest to handle
The second class, guest page faults, is pure overhead. We can eliminate
some of it on vmx using the following evil trick:
- when we set up a shadow page table entry, if the corresponding guest pte
is not present, set up the shadow pte as not present
- if the guest pte _is_ present, mark the shadow pte as present but also
set one of the reserved bits in the shadow pte
- tell the vmx hardware not to trap faults which have the present bit clear
With this, normal page-not-present faults go directly to the guest,
bypassing kvm entirely.
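
In rough C terms, the mechanism rests on two sentinel values for not-yet-mapped shadow ptes, shadow_trap_nonpresent_pte and shadow_notrap_nonpresent_pte, both visible in the diff below. The following is a minimal sketch of the idea only; the bit chosen as the "reserved" marker and the helper are illustrative assumptions, not the exact constants the patch installs:

/* Sketch only: the reserved bit used here is an assumption for
 * illustration, not the value the real code picks. */
#include <stdint.h>

#define SKETCH_PRESENT_MASK   (1ULL << 0)
#define SKETCH_RESERVED_MARK  (1ULL << 51)  /* some pte bit reserved by the CPU */

/* Guest pte present, shadow mapping not built yet: keep P=1 plus a
 * reserved bit, so the resulting fault has P=1 in its error code and
 * is trapped by kvm, which can then build the real shadow mapping. */
static const uint64_t shadow_trap_nonpresent_pte =
        SKETCH_PRESENT_MASK | SKETCH_RESERVED_MARK;

/* Guest pte not present: leave the shadow pte clear, so the fault has
 * P=0 in its error code and, with vmx told not to trap P=0 faults, it
 * is delivered straight to the guest. */
static const uint64_t shadow_notrap_nonpresent_pte = 0;

/* Picking the sentinel when shadowing a guest page table entry. */
static inline uint64_t sketch_nonpresent_spte(uint64_t guest_pte)
{
        return (guest_pte & SKETCH_PRESENT_MASK) ? shadow_trap_nonpresent_pte
                                                 : shadow_notrap_nonpresent_pte;
}

The prefetch_page() hunk at the end of the diff below applies exactly this per-guest-pte choice when populating a shadow page.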
Unfortunately, this trick only works on Intel hardware, as AMD lacks a
way to discriminate among page faults based on error code. It is also
a little risky since it uses reserved bits which might become unreserved
in the future, so a module parameter is provided to disable it.
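
The opt-out follows the usual module-parameter pattern; as a hedged sketch (the parameter name and default below are assumptions for illustration, and the real knob sits in the vmx module code rather than in this file):

/* Sketch only: parameter name and default are assumed for illustration,
 * e.g. something along the lines of "modprobe kvm-intel bypass_guest_pf=0"
 * to fall back to trapping all not-present faults. */
#include <linux/module.h>

static int bypass_guest_pf = 1;         /* 1 = use the reserved-bit trick */
module_param(bypass_guest_pf, bool, 0);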
Signed-off-by: Avi Kivity <avi@qumranet.com>
Diffstat (limited to 'drivers/kvm/paging_tmpl.h')
-rw-r--r--	drivers/kvm/paging_tmpl.h	52
1 file changed, 39 insertions(+), 13 deletions(-)
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 6b094b44f8fb..99ac9b15f773 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -31,6 +31,7 @@
 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
+#define PT_LEVEL_BITS PT64_LEVEL_BITS
 #ifdef CONFIG_X86_64
 #define PT_MAX_FULL_LEVELS 4
 #else
@@ -45,6 +46,7 @@
 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
+#define PT_LEVEL_BITS PT32_LEVEL_BITS
 #define PT_MAX_FULL_LEVELS 2
 #else
 #error Invalid PTTYPE value
@@ -211,12 +213,12 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
 {
         hpa_t paddr;
         int dirty = gpte & PT_DIRTY_MASK;
-        u64 spte = *shadow_pte;
-        int was_rmapped = is_rmap_pte(spte);
+        u64 spte;
+        int was_rmapped = is_rmap_pte(*shadow_pte);
 
         pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
                  " user_fault %d gfn %lx\n",
-                 __FUNCTION__, spte, (u64)gpte, access_bits,
+                 __FUNCTION__, *shadow_pte, (u64)gpte, access_bits,
                  write_fault, user_fault, gfn);
 
         if (write_fault && !dirty) {
@@ -236,7 +238,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
                 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
         }
 
-        spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
+        spte = PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
         spte |= gpte & PT64_NX_MASK;
         if (!dirty)
                 access_bits &= ~PT_WRITABLE_MASK;
@@ -248,10 +250,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
                 spte |= PT_USER_MASK;
 
         if (is_error_hpa(paddr)) {
-                spte |= gaddr;
-                spte |= PT_SHADOW_IO_MARK;
-                spte &= ~PT_PRESENT_MASK;
-                set_shadow_pte(shadow_pte, spte);
+                set_shadow_pte(shadow_pte,
+                               shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
                 return;
         }
 
@@ -286,6 +286,7 @@ unshadowed:
         if (access_bits & PT_WRITABLE_MASK)
                 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
 
+        pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
         set_shadow_pte(shadow_pte, spte);
         page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
         if (!was_rmapped)
@@ -304,14 +305,18 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
 }
 
 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
-                              u64 *spte, const void *pte, int bytes)
+                              u64 *spte, const void *pte, int bytes,
+                              int offset_in_pte)
 {
         pt_element_t gpte;
 
-        if (bytes < sizeof(pt_element_t))
-                return;
         gpte = *(const pt_element_t *)pte;
-        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
+        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
+                if (!offset_in_pte && !is_present_pte(gpte))
+                        set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
+                return;
+        }
+        if (bytes < sizeof(pt_element_t))
                 return;
         pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
         FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
@@ -368,7 +373,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                 unsigned hugepage_access = 0;
 
                 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
-                if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
+                if (is_shadow_present_pte(*shadow_ent)) {
                         if (level == PT_PAGE_TABLE_LEVEL)
                                 break;
                         shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
@@ -500,6 +505,26 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
         return gpa;
 }
 
+static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
+                                 struct kvm_mmu_page *sp)
+{
+        int i;
+        pt_element_t *gpt;
+
+        if (sp->role.metaphysical || PTTYPE == 32) {
+                nonpaging_prefetch_page(vcpu, sp);
+                return;
+        }
+
+        gpt = kmap_atomic(gfn_to_page(vcpu->kvm, sp->gfn), KM_USER0);
+        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+                if (is_present_pte(gpt[i]))
+                        sp->spt[i] = shadow_trap_nonpresent_pte;
+                else
+                        sp->spt[i] = shadow_notrap_nonpresent_pte;
+        kunmap_atomic(gpt, KM_USER0);
+}
+
 #undef pt_element_t
 #undef guest_walker
 #undef FNAME
@@ -508,4 +533,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
 #undef SHADOW_PT_INDEX
 #undef PT_LEVEL_MASK
 #undef PT_DIR_BASE_ADDR_MASK
+#undef PT_LEVEL_BITS
 #undef PT_MAX_FULL_LEVELS