diff options
author | Avi Kivity <avi@qumranet.com> | 2007-09-16 12:58:32 -0400 |
---|---|---|
committer | Avi Kivity <avi@qumranet.com> | 2008-01-30 10:52:48 -0500 |
commit | c7addb902054195b995114df154e061c7d604f69 (patch) | |
tree | 985910a6c970957126c91e55c55b0e73ae877e0c /drivers/kvm/mmu.c | |
parent | 51c6cf662b4b361a09fbd324f4c67875d9bcfbea (diff) |
KVM: Allow not-present guest page faults to bypass kvm
There are two classes of page faults trapped by kvm:
- host page faults, where the fault is needed to allow kvm to install
  the shadow pte or update the guest accessed and dirty bits
- guest page faults, where the guest has faulted and kvm simply injects
  the fault back into the guest to handle
The second class, guest page faults, is pure overhead. We can eliminate
some of it on vmx using the following evil trick:
- when we set up a shadow page table entry, if the corresponding guest pte
  is not present, set up the shadow pte as not present
- if the guest pte _is_ present, mark the shadow pte as present but also
  set one of the reserved bits in the shadow pte
- tell the vmx hardware not to trap faults which have the present bit clear
With this, normal page-not-present faults go directly to the guest,
bypassing kvm entirely.
Unfortunately, this trick only works on Intel hardware, as AMD lacks a
way to discriminate among page faults based on error code. It is also
a little risky since it uses reserved bits which might become unreserved
in the future, so a module parameter is provided to disable it.
Signed-off-by: Avi Kivity <avi@qumranet.com>
Diffstat (limited to 'drivers/kvm/mmu.c')
-rw-r--r-- | drivers/kvm/mmu.c | 89 |
1 files changed, 68 insertions, 21 deletions
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c index feb5ac986c5d..069ce83f018e 100644 --- a/drivers/kvm/mmu.c +++ b/drivers/kvm/mmu.c | |||
@@ -156,6 +156,16 @@ static struct kmem_cache *pte_chain_cache; | |||
156 | static struct kmem_cache *rmap_desc_cache; | 156 | static struct kmem_cache *rmap_desc_cache; |
157 | static struct kmem_cache *mmu_page_header_cache; | 157 | static struct kmem_cache *mmu_page_header_cache; |
158 | 158 | ||
159 | static u64 __read_mostly shadow_trap_nonpresent_pte; | ||
160 | static u64 __read_mostly shadow_notrap_nonpresent_pte; | ||
161 | |||
162 | void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) | ||
163 | { | ||
164 | shadow_trap_nonpresent_pte = trap_pte; | ||
165 | shadow_notrap_nonpresent_pte = notrap_pte; | ||
166 | } | ||
167 | EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); | ||
168 | |||
159 | static int is_write_protection(struct kvm_vcpu *vcpu) | 169 | static int is_write_protection(struct kvm_vcpu *vcpu) |
160 | { | 170 | { |
161 | return vcpu->cr0 & X86_CR0_WP; | 171 | return vcpu->cr0 & X86_CR0_WP; |
@@ -176,6 +186,13 @@ static int is_present_pte(unsigned long pte) | |||
176 | return pte & PT_PRESENT_MASK; | 186 | return pte & PT_PRESENT_MASK; |
177 | } | 187 | } |
178 | 188 | ||
189 | static int is_shadow_present_pte(u64 pte) | ||
190 | { | ||
191 | pte &= ~PT_SHADOW_IO_MARK; | ||
192 | return pte != shadow_trap_nonpresent_pte | ||
193 | && pte != shadow_notrap_nonpresent_pte; | ||
194 | } | ||
195 | |||
179 | static int is_writeble_pte(unsigned long pte) | 196 | static int is_writeble_pte(unsigned long pte) |
180 | { | 197 | { |
181 | return pte & PT_WRITABLE_MASK; | 198 | return pte & PT_WRITABLE_MASK; |
@@ -450,7 +467,7 @@ static int is_empty_shadow_page(u64 *spt) | |||
450 | u64 *end; | 467 | u64 *end; |
451 | 468 | ||
452 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) | 469 | for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) |
453 | if (*pos != 0) { | 470 | if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { |
454 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, | 471 | printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, |
455 | pos, *pos); | 472 | pos, *pos); |
456 | return 0; | 473 | return 0; |
@@ -632,6 +649,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
632 | page->gfn = gfn; | 649 | page->gfn = gfn; |
633 | page->role = role; | 650 | page->role = role; |
634 | hlist_add_head(&page->hash_link, bucket); | 651 | hlist_add_head(&page->hash_link, bucket); |
652 | vcpu->mmu.prefetch_page(vcpu, page); | ||
635 | if (!metaphysical) | 653 | if (!metaphysical) |
636 | rmap_write_protect(vcpu, gfn); | 654 | rmap_write_protect(vcpu, gfn); |
637 | return page; | 655 | return page; |
@@ -648,9 +666,9 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
648 | 666 | ||
649 | if (page->role.level == PT_PAGE_TABLE_LEVEL) { | 667 | if (page->role.level == PT_PAGE_TABLE_LEVEL) { |
650 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 668 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
651 | if (pt[i] & PT_PRESENT_MASK) | 669 | if (is_shadow_present_pte(pt[i])) |
652 | rmap_remove(&pt[i]); | 670 | rmap_remove(&pt[i]); |
653 | pt[i] = 0; | 671 | pt[i] = shadow_trap_nonpresent_pte; |
654 | } | 672 | } |
655 | kvm_flush_remote_tlbs(kvm); | 673 | kvm_flush_remote_tlbs(kvm); |
656 | return; | 674 | return; |
@@ -659,8 +677,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, | |||
659 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { | 677 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { |
660 | ent = pt[i]; | 678 | ent = pt[i]; |
661 | 679 | ||
662 | pt[i] = 0; | 680 | pt[i] = shadow_trap_nonpresent_pte; |
663 | if (!(ent & PT_PRESENT_MASK)) | 681 | if (!is_shadow_present_pte(ent)) |
664 | continue; | 682 | continue; |
665 | ent &= PT64_BASE_ADDR_MASK; | 683 | ent &= PT64_BASE_ADDR_MASK; |
666 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); | 684 | mmu_page_remove_parent_pte(page_header(ent), &pt[i]); |
@@ -691,7 +709,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, | |||
691 | } | 709 | } |
692 | BUG_ON(!parent_pte); | 710 | BUG_ON(!parent_pte); |
693 | kvm_mmu_put_page(page, parent_pte); | 711 | kvm_mmu_put_page(page, parent_pte); |
694 | set_shadow_pte(parent_pte, 0); | 712 | set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); |
695 | } | 713 | } |
696 | kvm_mmu_page_unlink_children(kvm, page); | 714 | kvm_mmu_page_unlink_children(kvm, page); |
697 | if (!page->root_count) { | 715 | if (!page->root_count) { |
@@ -798,7 +816,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
798 | 816 | ||
799 | if (level == 1) { | 817 | if (level == 1) { |
800 | pte = table[index]; | 818 | pte = table[index]; |
801 | if (is_present_pte(pte) && is_writeble_pte(pte)) | 819 | if (is_shadow_present_pte(pte) && is_writeble_pte(pte)) |
802 | return 0; | 820 | return 0; |
803 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); | 821 | mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); |
804 | page_header_update_slot(vcpu->kvm, table, v); | 822 | page_header_update_slot(vcpu->kvm, table, v); |
@@ -808,7 +826,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
808 | return 0; | 826 | return 0; |
809 | } | 827 | } |
810 | 828 | ||
811 | if (table[index] == 0) { | 829 | if (table[index] == shadow_trap_nonpresent_pte) { |
812 | struct kvm_mmu_page *new_table; | 830 | struct kvm_mmu_page *new_table; |
813 | gfn_t pseudo_gfn; | 831 | gfn_t pseudo_gfn; |
814 | 832 | ||
@@ -829,6 +847,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) | |||
829 | } | 847 | } |
830 | } | 848 | } |
831 | 849 | ||
850 | static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, | ||
851 | struct kvm_mmu_page *sp) | ||
852 | { | ||
853 | int i; | ||
854 | |||
855 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i) | ||
856 | sp->spt[i] = shadow_trap_nonpresent_pte; | ||
857 | } | ||
858 | |||
832 | static void mmu_free_roots(struct kvm_vcpu *vcpu) | 859 | static void mmu_free_roots(struct kvm_vcpu *vcpu) |
833 | { | 860 | { |
834 | int i; | 861 | int i; |
@@ -943,6 +970,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) | |||
943 | context->page_fault = nonpaging_page_fault; | 970 | context->page_fault = nonpaging_page_fault; |
944 | context->gva_to_gpa = nonpaging_gva_to_gpa; | 971 | context->gva_to_gpa = nonpaging_gva_to_gpa; |
945 | context->free = nonpaging_free; | 972 | context->free = nonpaging_free; |
973 | context->prefetch_page = nonpaging_prefetch_page; | ||
946 | context->root_level = 0; | 974 | context->root_level = 0; |
947 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 975 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
948 | context->root_hpa = INVALID_PAGE; | 976 | context->root_hpa = INVALID_PAGE; |
@@ -989,6 +1017,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) | |||
989 | context->new_cr3 = paging_new_cr3; | 1017 | context->new_cr3 = paging_new_cr3; |
990 | context->page_fault = paging64_page_fault; | 1018 | context->page_fault = paging64_page_fault; |
991 | context->gva_to_gpa = paging64_gva_to_gpa; | 1019 | context->gva_to_gpa = paging64_gva_to_gpa; |
1020 | context->prefetch_page = paging64_prefetch_page; | ||
992 | context->free = paging_free; | 1021 | context->free = paging_free; |
993 | context->root_level = level; | 1022 | context->root_level = level; |
994 | context->shadow_root_level = level; | 1023 | context->shadow_root_level = level; |
@@ -1009,6 +1038,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) | |||
1009 | context->page_fault = paging32_page_fault; | 1038 | context->page_fault = paging32_page_fault; |
1010 | context->gva_to_gpa = paging32_gva_to_gpa; | 1039 | context->gva_to_gpa = paging32_gva_to_gpa; |
1011 | context->free = paging_free; | 1040 | context->free = paging_free; |
1041 | context->prefetch_page = paging32_prefetch_page; | ||
1012 | context->root_level = PT32_ROOT_LEVEL; | 1042 | context->root_level = PT32_ROOT_LEVEL; |
1013 | context->shadow_root_level = PT32E_ROOT_LEVEL; | 1043 | context->shadow_root_level = PT32E_ROOT_LEVEL; |
1014 | context->root_hpa = INVALID_PAGE; | 1044 | context->root_hpa = INVALID_PAGE; |
@@ -1081,7 +1111,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
1081 | struct kvm_mmu_page *child; | 1111 | struct kvm_mmu_page *child; |
1082 | 1112 | ||
1083 | pte = *spte; | 1113 | pte = *spte; |
1084 | if (is_present_pte(pte)) { | 1114 | if (is_shadow_present_pte(pte)) { |
1085 | if (page->role.level == PT_PAGE_TABLE_LEVEL) | 1115 | if (page->role.level == PT_PAGE_TABLE_LEVEL) |
1086 | rmap_remove(spte); | 1116 | rmap_remove(spte); |
1087 | else { | 1117 | else { |
@@ -1089,22 +1119,25 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, | |||
1089 | mmu_page_remove_parent_pte(child, spte); | 1119 | mmu_page_remove_parent_pte(child, spte); |
1090 | } | 1120 | } |
1091 | } | 1121 | } |
1092 | set_shadow_pte(spte, 0); | 1122 | set_shadow_pte(spte, shadow_trap_nonpresent_pte); |
1093 | kvm_flush_remote_tlbs(vcpu->kvm); | 1123 | kvm_flush_remote_tlbs(vcpu->kvm); |
1094 | } | 1124 | } |
1095 | 1125 | ||
1096 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, | 1126 | static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, |
1097 | struct kvm_mmu_page *page, | 1127 | struct kvm_mmu_page *page, |
1098 | u64 *spte, | 1128 | u64 *spte, |
1099 | const void *new, int bytes) | 1129 | const void *new, int bytes, |
1130 | int offset_in_pte) | ||
1100 | { | 1131 | { |
1101 | if (page->role.level != PT_PAGE_TABLE_LEVEL) | 1132 | if (page->role.level != PT_PAGE_TABLE_LEVEL) |
1102 | return; | 1133 | return; |
1103 | 1134 | ||
1104 | if (page->role.glevels == PT32_ROOT_LEVEL) | 1135 | if (page->role.glevels == PT32_ROOT_LEVEL) |
1105 | paging32_update_pte(vcpu, page, spte, new, bytes); | 1136 | paging32_update_pte(vcpu, page, spte, new, bytes, |
1137 | offset_in_pte); | ||
1106 | else | 1138 | else |
1107 | paging64_update_pte(vcpu, page, spte, new, bytes); | 1139 | paging64_update_pte(vcpu, page, spte, new, bytes, |
1140 | offset_in_pte); | ||
1108 | } | 1141 | } |
1109 | 1142 | ||
1110 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | 1143 | void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, |
@@ -1126,6 +1159,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1126 | int npte; | 1159 | int npte; |
1127 | 1160 | ||
1128 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); | 1161 | pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); |
1162 | kvm_mmu_audit(vcpu, "pre pte write"); | ||
1129 | if (gfn == vcpu->last_pt_write_gfn) { | 1163 | if (gfn == vcpu->last_pt_write_gfn) { |
1130 | ++vcpu->last_pt_write_count; | 1164 | ++vcpu->last_pt_write_count; |
1131 | if (vcpu->last_pt_write_count >= 3) | 1165 | if (vcpu->last_pt_write_count >= 3) |
@@ -1181,10 +1215,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, | |||
1181 | spte = &page->spt[page_offset / sizeof(*spte)]; | 1215 | spte = &page->spt[page_offset / sizeof(*spte)]; |
1182 | while (npte--) { | 1216 | while (npte--) { |
1183 | mmu_pte_write_zap_pte(vcpu, page, spte); | 1217 | mmu_pte_write_zap_pte(vcpu, page, spte); |
1184 | mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); | 1218 | mmu_pte_write_new_pte(vcpu, page, spte, new, bytes, |
1219 | page_offset & (pte_size - 1)); | ||
1185 | ++spte; | 1220 | ++spte; |
1186 | } | 1221 | } |
1187 | } | 1222 | } |
1223 | kvm_mmu_audit(vcpu, "post pte write"); | ||
1188 | } | 1224 | } |
1189 | 1225 | ||
1190 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) | 1226 | int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) |
@@ -1359,22 +1395,33 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, | |||
1359 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { | 1395 | for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { |
1360 | u64 ent = pt[i]; | 1396 | u64 ent = pt[i]; |
1361 | 1397 | ||
1362 | if (!(ent & PT_PRESENT_MASK)) | 1398 | if (ent == shadow_trap_nonpresent_pte) |
1363 | continue; | 1399 | continue; |
1364 | 1400 | ||
1365 | va = canonicalize(va); | 1401 | va = canonicalize(va); |
1366 | if (level > 1) | 1402 | if (level > 1) { |
1403 | if (ent == shadow_notrap_nonpresent_pte) | ||
1404 | printk(KERN_ERR "audit: (%s) nontrapping pte" | ||
1405 | " in nonleaf level: levels %d gva %lx" | ||
1406 | " level %d pte %llx\n", audit_msg, | ||
1407 | vcpu->mmu.root_level, va, level, ent); | ||
1408 | |||
1367 | audit_mappings_page(vcpu, ent, va, level - 1); | 1409 | audit_mappings_page(vcpu, ent, va, level - 1); |
1368 | else { | 1410 | } else { |
1369 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); | 1411 | gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); |
1370 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); | 1412 | hpa_t hpa = gpa_to_hpa(vcpu, gpa); |
1371 | 1413 | ||
1372 | if ((ent & PT_PRESENT_MASK) | 1414 | if (is_shadow_present_pte(ent) |
1373 | && (ent & PT64_BASE_ADDR_MASK) != hpa) | 1415 | && (ent & PT64_BASE_ADDR_MASK) != hpa) |
1374 | printk(KERN_ERR "audit error: (%s) levels %d" | 1416 | printk(KERN_ERR "xx audit error: (%s) levels %d" |
1375 | " gva %lx gpa %llx hpa %llx ent %llx\n", | 1417 | " gva %lx gpa %llx hpa %llx ent %llx %d\n", |
1376 | audit_msg, vcpu->mmu.root_level, | 1418 | audit_msg, vcpu->mmu.root_level, |
1377 | va, gpa, hpa, ent); | 1419 | va, gpa, hpa, ent, is_shadow_present_pte(ent)); |
1420 | else if (ent == shadow_notrap_nonpresent_pte | ||
1421 | && !is_error_hpa(hpa)) | ||
1422 | printk(KERN_ERR "audit: (%s) notrap shadow," | ||
1423 | " valid guest gva %lx\n", audit_msg, va); | ||
1424 | |||
1378 | } | 1425 | } |
1379 | } | 1426 | } |
1380 | } | 1427 | } |