author		Avi Kivity <avi@qumranet.com>	2007-09-16 12:58:32 -0400
committer	Avi Kivity <avi@qumranet.com>	2008-01-30 10:52:48 -0500
commit		c7addb902054195b995114df154e061c7d604f69 (patch)
tree		985910a6c970957126c91e55c55b0e73ae877e0c /drivers/kvm/mmu.c
parent		51c6cf662b4b361a09fbd324f4c67875d9bcfbea (diff)
KVM: Allow not-present guest page faults to bypass kvm
There are two classes of page faults trapped by kvm:

 - host page faults, where the fault is needed to allow kvm to install
   the shadow pte or update the guest accessed and dirty bits
 - guest page faults, where the guest has faulted and kvm simply injects
   the fault back into the guest to handle

The second class, guest page faults, is pure overhead.  We can eliminate
some of it on vmx using the following evil trick:

 - when we set up a shadow page table entry, if the corresponding guest
   pte is not present, set up the shadow pte as not present
 - if the guest pte _is_ present, mark the shadow pte as present but also
   set one of the reserved bits in the shadow pte
 - tell the vmx hardware not to trap faults which have the present bit
   clear

With this, normal page-not-present faults go directly to the guest,
bypassing kvm entirely.

Unfortunately, this trick only works on Intel hardware, as AMD lacks a
way to discriminate among page faults based on error code.  It is also a
little risky since it uses reserved bits which might become unreserved
in the future, so a module parameter is provided to disable it.

Signed-off-by: Avi Kivity <avi@qumranet.com>
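[Annotation] To make the trick concrete, here is a minimal userspace sketch of the two
helpers the patch introduces below, kvm_mmu_set_nonpresent_ptes() and
is_shadow_present_pte().  The sentinel values and the PT_SHADOW_IO_MARK bit
position in the sketch are illustrative assumptions, not the constants the
real vmx.c passes in: with the trick disabled (or on AMD) both sentinels are
plainly zero, while with it enabled vmx is expected to use a
present-plus-reserved-bits pattern for the "trap" sentinel (so faults on it
still exit to kvm) and a genuinely not-present value for the "notrap"
sentinel (so faults on it are delivered straight to the guest).

/*
 * Minimal userspace sketch of the non-present-pte bookkeeping this patch
 * adds to mmu.c.  The bit positions and sentinel values below are
 * illustrative assumptions only, not the constants the real vmx.c uses.
 */
#include <stdint.h>
#include <stdio.h>

#define PT_PRESENT_MASK		(1ULL << 0)
#define PT_SHADOW_IO_MARK	(1ULL << 9)	/* assumed software-available bit */

/* Faults on a "trap" pte exit to kvm; faults on a "notrap" pte go to the guest. */
static uint64_t shadow_trap_nonpresent_pte;
static uint64_t shadow_notrap_nonpresent_pte;

static void kvm_mmu_set_nonpresent_ptes(uint64_t trap_pte, uint64_t notrap_pte)
{
	shadow_trap_nonpresent_pte = trap_pte;
	shadow_notrap_nonpresent_pte = notrap_pte;
}

/* A shadow pte is considered present iff it is neither of the two sentinels. */
static int is_shadow_present_pte(uint64_t pte)
{
	pte &= ~PT_SHADOW_IO_MARK;
	return pte != shadow_trap_nonpresent_pte
		&& pte != shadow_notrap_nonpresent_pte;
}

int main(void)
{
	/* Trick disabled (or non-Intel hardware): both sentinels are plain 0. */
	kvm_mmu_set_nonpresent_ptes(0, 0);
	printf("%d\n", is_shadow_present_pte(0));	/* 0: not present */

	/*
	 * Trick enabled (assumed patterns): "trap" is present with reserved
	 * bits set so the fault still exits to kvm, "notrap" keeps the
	 * present bit clear so the fault is delivered directly to the guest.
	 */
	kvm_mmu_set_nonpresent_ptes((0xdeadULL << 48) | PT_PRESENT_MASK, 0);
	printf("%d\n", is_shadow_present_pte(0x123000ULL | PT_PRESENT_MASK));	/* 1 */
	printf("%d\n", is_shadow_present_pte((0xdeadULL << 48) | PT_PRESENT_MASK));	/* 0 */
	return 0;
}

Compiled with any C compiler, the three lines print 0, 1 and 0.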
Diffstat (limited to 'drivers/kvm/mmu.c')
-rw-r--r--	drivers/kvm/mmu.c	89
1 files changed, 68 insertions, 21 deletions
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index feb5ac986c5d..069ce83f018e 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -156,6 +156,16 @@ static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 
+static u64 __read_mostly shadow_trap_nonpresent_pte;
+static u64 __read_mostly shadow_notrap_nonpresent_pte;
+
+void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+{
+	shadow_trap_nonpresent_pte = trap_pte;
+	shadow_notrap_nonpresent_pte = notrap_pte;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
+
 static int is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return vcpu->cr0 & X86_CR0_WP;
@@ -176,6 +186,13 @@ static int is_present_pte(unsigned long pte)
 	return pte & PT_PRESENT_MASK;
 }
 
+static int is_shadow_present_pte(u64 pte)
+{
+	pte &= ~PT_SHADOW_IO_MARK;
+	return pte != shadow_trap_nonpresent_pte
+		&& pte != shadow_notrap_nonpresent_pte;
+}
+
 static int is_writeble_pte(unsigned long pte)
 {
 	return pte & PT_WRITABLE_MASK;
@@ -450,7 +467,7 @@ static int is_empty_shadow_page(u64 *spt)
 	u64 *end;
 
 	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
-		if (*pos != 0) {
+		if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
 			printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
 			       pos, *pos);
 			return 0;
@@ -632,6 +649,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	page->gfn = gfn;
 	page->role = role;
 	hlist_add_head(&page->hash_link, bucket);
+	vcpu->mmu.prefetch_page(vcpu, page);
 	if (!metaphysical)
 		rmap_write_protect(vcpu, gfn);
 	return page;
@@ -648,9 +666,9 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 
 	if (page->role.level == PT_PAGE_TABLE_LEVEL) {
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-			if (pt[i] & PT_PRESENT_MASK)
+			if (is_shadow_present_pte(pt[i]))
 				rmap_remove(&pt[i]);
-			pt[i] = 0;
+			pt[i] = shadow_trap_nonpresent_pte;
 		}
 		kvm_flush_remote_tlbs(kvm);
 		return;
@@ -659,8 +677,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
 		ent = pt[i];
 
-		pt[i] = 0;
-		if (!(ent & PT_PRESENT_MASK))
+		pt[i] = shadow_trap_nonpresent_pte;
+		if (!is_shadow_present_pte(ent))
 			continue;
 		ent &= PT64_BASE_ADDR_MASK;
 		mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
@@ -691,7 +709,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm,
 		}
 		BUG_ON(!parent_pte);
 		kvm_mmu_put_page(page, parent_pte);
-		set_shadow_pte(parent_pte, 0);
+		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
 	}
 	kvm_mmu_page_unlink_children(kvm, page);
 	if (!page->root_count) {
@@ -798,7 +816,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
 
 		if (level == 1) {
 			pte = table[index];
-			if (is_present_pte(pte) && is_writeble_pte(pte))
+			if (is_shadow_present_pte(pte) && is_writeble_pte(pte))
 				return 0;
 			mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
 			page_header_update_slot(vcpu->kvm, table, v);
@@ -808,7 +826,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
 			return 0;
 		}
 
-		if (table[index] == 0) {
+		if (table[index] == shadow_trap_nonpresent_pte) {
 			struct kvm_mmu_page *new_table;
 			gfn_t pseudo_gfn;
 
@@ -829,6 +847,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
 	}
 }
 
+static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
+				    struct kvm_mmu_page *sp)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		sp->spt[i] = shadow_trap_nonpresent_pte;
+}
+
 static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -943,6 +970,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
+	context->prefetch_page = nonpaging_prefetch_page;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -989,6 +1017,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
+	context->prefetch_page = paging64_prefetch_page;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1009,6 +1038,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
+	context->prefetch_page = paging32_prefetch_page;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1081,7 +1111,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *child;
 
 	pte = *spte;
-	if (is_present_pte(pte)) {
+	if (is_shadow_present_pte(pte)) {
 		if (page->role.level == PT_PAGE_TABLE_LEVEL)
 			rmap_remove(spte);
 		else {
@@ -1089,22 +1119,25 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 			mmu_page_remove_parent_pte(child, spte);
 		}
 	}
-	set_shadow_pte(spte, 0);
+	set_shadow_pte(spte, shadow_trap_nonpresent_pte);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *page,
 				  u64 *spte,
-				  const void *new, int bytes)
+				  const void *new, int bytes,
+				  int offset_in_pte)
 {
 	if (page->role.level != PT_PAGE_TABLE_LEVEL)
 		return;
 
 	if (page->role.glevels == PT32_ROOT_LEVEL)
-		paging32_update_pte(vcpu, page, spte, new, bytes);
+		paging32_update_pte(vcpu, page, spte, new, bytes,
+				    offset_in_pte);
 	else
-		paging64_update_pte(vcpu, page, spte, new, bytes);
+		paging64_update_pte(vcpu, page, spte, new, bytes,
+				    offset_in_pte);
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1126,6 +1159,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int npte;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+	kvm_mmu_audit(vcpu, "pre pte write");
 	if (gfn == vcpu->last_pt_write_gfn) {
 		++vcpu->last_pt_write_count;
 		if (vcpu->last_pt_write_count >= 3)
@@ -1181,10 +1215,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		spte = &page->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			mmu_pte_write_zap_pte(vcpu, page, spte);
-			mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
+			mmu_pte_write_new_pte(vcpu, page, spte, new, bytes,
+					      page_offset & (pte_size - 1));
 			++spte;
 		}
 	}
+	kvm_mmu_audit(vcpu, "post pte write");
 }
 
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1359,22 +1395,33 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 	for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
 		u64 ent = pt[i];
 
-		if (!(ent & PT_PRESENT_MASK))
+		if (ent == shadow_trap_nonpresent_pte)
 			continue;
 
 		va = canonicalize(va);
-		if (level > 1)
+		if (level > 1) {
+			if (ent == shadow_notrap_nonpresent_pte)
+				printk(KERN_ERR "audit: (%s) nontrapping pte"
+				       " in nonleaf level: levels %d gva %lx"
+				       " level %d pte %llx\n", audit_msg,
+				       vcpu->mmu.root_level, va, level, ent);
+
 			audit_mappings_page(vcpu, ent, va, level - 1);
-		else {
+		} else {
 			gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
 			hpa_t hpa = gpa_to_hpa(vcpu, gpa);
 
-			if ((ent & PT_PRESENT_MASK)
+			if (is_shadow_present_pte(ent)
 			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-				printk(KERN_ERR "audit error: (%s) levels %d"
-				       " gva %lx gpa %llx hpa %llx ent %llx\n",
+				printk(KERN_ERR "xx audit error: (%s) levels %d"
+				       " gva %lx gpa %llx hpa %llx ent %llx %d\n",
 				       audit_msg, vcpu->mmu.root_level,
-				       va, gpa, hpa, ent);
+				       va, gpa, hpa, ent, is_shadow_present_pte(ent));
+			else if (ent == shadow_notrap_nonpresent_pte
+				 && !is_error_hpa(hpa))
+				printk(KERN_ERR "audit: (%s) notrap shadow,"
+				       " valid guest gva %lx\n", audit_msg, va);
+
 		}
 	}
 }