aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Avi Kivity <avi@qumranet.com> 2007-09-16 12:58:32 -0400
committer: Avi Kivity <avi@qumranet.com> 2008-01-30 10:52:48 -0500
commit: c7addb902054195b995114df154e061c7d604f69 (patch)
tree: 985910a6c970957126c91e55c55b0e73ae877e0c
parent: 51c6cf662b4b361a09fbd324f4c67875d9bcfbea (diff)
KVM: Allow not-present guest page faults to bypass kvm
There are two classes of page faults trapped by kvm:

 - host page faults, where the fault is needed to allow kvm to install
   the shadow pte or update the guest accessed and dirty bits
 - guest page faults, where the guest has faulted and kvm simply injects
   the fault back into the guest to handle

The second class, guest page faults, is pure overhead. We can eliminate
some of it on vmx using the following evil trick:

 - when we set up a shadow page table entry, if the corresponding guest
   pte is not present, set up the shadow pte as not present
 - if the guest pte _is_ present, mark the shadow pte as present but also
   set one of the reserved bits in the shadow pte
 - tell the vmx hardware not to trap faults which have the present bit
   clear

With this, normal page-not-present faults go directly to the guest,
bypassing kvm entirely.

Unfortunately, this trick only works on Intel hardware, as AMD lacks a
way to discriminate among page faults based on error code. It is also a
little risky since it uses reserved bits which might become unreserved
in the future, so a module parameter is provided to disable it.

Signed-off-by: Avi Kivity <avi@qumranet.com>
-rw-r--r--  drivers/kvm/kvm.h           3
-rw-r--r--  drivers/kvm/kvm_main.c      4
-rw-r--r--  drivers/kvm/mmu.c          89
-rw-r--r--  drivers/kvm/paging_tmpl.h  52
-rw-r--r--  drivers/kvm/vmx.c          11
5 files changed, 122 insertions, 37 deletions
diff --git a/drivers/kvm/kvm.h b/drivers/kvm/kvm.h
index e885b190b798..7de948e9e64e 100644
--- a/drivers/kvm/kvm.h
+++ b/drivers/kvm/kvm.h
@@ -150,6 +150,8 @@ struct kvm_mmu {
150 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 150 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
151 void (*free)(struct kvm_vcpu *vcpu); 151 void (*free)(struct kvm_vcpu *vcpu);
152 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 152 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
153 void (*prefetch_page)(struct kvm_vcpu *vcpu,
154 struct kvm_mmu_page *page);
153 hpa_t root_hpa; 155 hpa_t root_hpa;
154 int root_level; 156 int root_level;
155 int shadow_root_level; 157 int shadow_root_level;
@@ -536,6 +538,7 @@ void kvm_mmu_module_exit(void);
536void kvm_mmu_destroy(struct kvm_vcpu *vcpu); 538void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
537int kvm_mmu_create(struct kvm_vcpu *vcpu); 539int kvm_mmu_create(struct kvm_vcpu *vcpu);
538int kvm_mmu_setup(struct kvm_vcpu *vcpu); 540int kvm_mmu_setup(struct kvm_vcpu *vcpu);
541void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
539 542
540int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 543int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
541void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); 544void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index 710483669f34..82cc7ae0fc83 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -3501,7 +3501,9 @@ int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3501 kvm_preempt_ops.sched_in = kvm_sched_in; 3501 kvm_preempt_ops.sched_in = kvm_sched_in;
3502 kvm_preempt_ops.sched_out = kvm_sched_out; 3502 kvm_preempt_ops.sched_out = kvm_sched_out;
3503 3503
3504 return r; 3504 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3505
3506 return 0;
3505 3507
3506out_free: 3508out_free:
3507 kmem_cache_destroy(kvm_vcpu_cache); 3509 kmem_cache_destroy(kvm_vcpu_cache);
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index feb5ac986c5d..069ce83f018e 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -156,6 +156,16 @@ static struct kmem_cache *pte_chain_cache;
156static struct kmem_cache *rmap_desc_cache; 156static struct kmem_cache *rmap_desc_cache;
157static struct kmem_cache *mmu_page_header_cache; 157static struct kmem_cache *mmu_page_header_cache;
158 158
159static u64 __read_mostly shadow_trap_nonpresent_pte;
160static u64 __read_mostly shadow_notrap_nonpresent_pte;
161
162void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
163{
164 shadow_trap_nonpresent_pte = trap_pte;
165 shadow_notrap_nonpresent_pte = notrap_pte;
166}
167EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
168
159static int is_write_protection(struct kvm_vcpu *vcpu) 169static int is_write_protection(struct kvm_vcpu *vcpu)
160{ 170{
161 return vcpu->cr0 & X86_CR0_WP; 171 return vcpu->cr0 & X86_CR0_WP;
@@ -176,6 +186,13 @@ static int is_present_pte(unsigned long pte)
176 return pte & PT_PRESENT_MASK; 186 return pte & PT_PRESENT_MASK;
177} 187}
178 188
189static int is_shadow_present_pte(u64 pte)
190{
191 pte &= ~PT_SHADOW_IO_MARK;
192 return pte != shadow_trap_nonpresent_pte
193 && pte != shadow_notrap_nonpresent_pte;
194}
195
179static int is_writeble_pte(unsigned long pte) 196static int is_writeble_pte(unsigned long pte)
180{ 197{
181 return pte & PT_WRITABLE_MASK; 198 return pte & PT_WRITABLE_MASK;
@@ -450,7 +467,7 @@ static int is_empty_shadow_page(u64 *spt)
450 u64 *end; 467 u64 *end;
451 468
452 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 469 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
453 if (*pos != 0) { 470 if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
454 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, 471 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
455 pos, *pos); 472 pos, *pos);
456 return 0; 473 return 0;
@@ -632,6 +649,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
632 page->gfn = gfn; 649 page->gfn = gfn;
633 page->role = role; 650 page->role = role;
634 hlist_add_head(&page->hash_link, bucket); 651 hlist_add_head(&page->hash_link, bucket);
652 vcpu->mmu.prefetch_page(vcpu, page);
635 if (!metaphysical) 653 if (!metaphysical)
636 rmap_write_protect(vcpu, gfn); 654 rmap_write_protect(vcpu, gfn);
637 return page; 655 return page;
@@ -648,9 +666,9 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
648 666
649 if (page->role.level == PT_PAGE_TABLE_LEVEL) { 667 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
650 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 668 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (pt[i] & PT_PRESENT_MASK) 669 if (is_shadow_present_pte(pt[i]))
652 rmap_remove(&pt[i]); 670 rmap_remove(&pt[i]);
653 pt[i] = 0; 671 pt[i] = shadow_trap_nonpresent_pte;
654 } 672 }
655 kvm_flush_remote_tlbs(kvm); 673 kvm_flush_remote_tlbs(kvm);
656 return; 674 return;
@@ -659,8 +677,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
659 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 677 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
660 ent = pt[i]; 678 ent = pt[i];
661 679
662 pt[i] = 0; 680 pt[i] = shadow_trap_nonpresent_pte;
663 if (!(ent & PT_PRESENT_MASK)) 681 if (!is_shadow_present_pte(ent))
664 continue; 682 continue;
665 ent &= PT64_BASE_ADDR_MASK; 683 ent &= PT64_BASE_ADDR_MASK;
666 mmu_page_remove_parent_pte(page_header(ent), &pt[i]); 684 mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
@@ -691,7 +709,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm,
691 } 709 }
692 BUG_ON(!parent_pte); 710 BUG_ON(!parent_pte);
693 kvm_mmu_put_page(page, parent_pte); 711 kvm_mmu_put_page(page, parent_pte);
694 set_shadow_pte(parent_pte, 0); 712 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
695 } 713 }
696 kvm_mmu_page_unlink_children(kvm, page); 714 kvm_mmu_page_unlink_children(kvm, page);
697 if (!page->root_count) { 715 if (!page->root_count) {
@@ -798,7 +816,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
798 816
799 if (level == 1) { 817 if (level == 1) {
800 pte = table[index]; 818 pte = table[index];
801 if (is_present_pte(pte) && is_writeble_pte(pte)) 819 if (is_shadow_present_pte(pte) && is_writeble_pte(pte))
802 return 0; 820 return 0;
803 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT); 821 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
804 page_header_update_slot(vcpu->kvm, table, v); 822 page_header_update_slot(vcpu->kvm, table, v);
@@ -808,7 +826,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
808 return 0; 826 return 0;
809 } 827 }
810 828
811 if (table[index] == 0) { 829 if (table[index] == shadow_trap_nonpresent_pte) {
812 struct kvm_mmu_page *new_table; 830 struct kvm_mmu_page *new_table;
813 gfn_t pseudo_gfn; 831 gfn_t pseudo_gfn;
814 832
@@ -829,6 +847,15 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
829 } 847 }
830} 848}
831 849
850static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
851 struct kvm_mmu_page *sp)
852{
853 int i;
854
855 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
856 sp->spt[i] = shadow_trap_nonpresent_pte;
857}
858
832static void mmu_free_roots(struct kvm_vcpu *vcpu) 859static void mmu_free_roots(struct kvm_vcpu *vcpu)
833{ 860{
834 int i; 861 int i;
@@ -943,6 +970,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
943 context->page_fault = nonpaging_page_fault; 970 context->page_fault = nonpaging_page_fault;
944 context->gva_to_gpa = nonpaging_gva_to_gpa; 971 context->gva_to_gpa = nonpaging_gva_to_gpa;
945 context->free = nonpaging_free; 972 context->free = nonpaging_free;
973 context->prefetch_page = nonpaging_prefetch_page;
946 context->root_level = 0; 974 context->root_level = 0;
947 context->shadow_root_level = PT32E_ROOT_LEVEL; 975 context->shadow_root_level = PT32E_ROOT_LEVEL;
948 context->root_hpa = INVALID_PAGE; 976 context->root_hpa = INVALID_PAGE;
@@ -989,6 +1017,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
989 context->new_cr3 = paging_new_cr3; 1017 context->new_cr3 = paging_new_cr3;
990 context->page_fault = paging64_page_fault; 1018 context->page_fault = paging64_page_fault;
991 context->gva_to_gpa = paging64_gva_to_gpa; 1019 context->gva_to_gpa = paging64_gva_to_gpa;
1020 context->prefetch_page = paging64_prefetch_page;
992 context->free = paging_free; 1021 context->free = paging_free;
993 context->root_level = level; 1022 context->root_level = level;
994 context->shadow_root_level = level; 1023 context->shadow_root_level = level;
@@ -1009,6 +1038,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1009 context->page_fault = paging32_page_fault; 1038 context->page_fault = paging32_page_fault;
1010 context->gva_to_gpa = paging32_gva_to_gpa; 1039 context->gva_to_gpa = paging32_gva_to_gpa;
1011 context->free = paging_free; 1040 context->free = paging_free;
1041 context->prefetch_page = paging32_prefetch_page;
1012 context->root_level = PT32_ROOT_LEVEL; 1042 context->root_level = PT32_ROOT_LEVEL;
1013 context->shadow_root_level = PT32E_ROOT_LEVEL; 1043 context->shadow_root_level = PT32E_ROOT_LEVEL;
1014 context->root_hpa = INVALID_PAGE; 1044 context->root_hpa = INVALID_PAGE;
@@ -1081,7 +1111,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1081 struct kvm_mmu_page *child; 1111 struct kvm_mmu_page *child;
1082 1112
1083 pte = *spte; 1113 pte = *spte;
1084 if (is_present_pte(pte)) { 1114 if (is_shadow_present_pte(pte)) {
1085 if (page->role.level == PT_PAGE_TABLE_LEVEL) 1115 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1086 rmap_remove(spte); 1116 rmap_remove(spte);
1087 else { 1117 else {
@@ -1089,22 +1119,25 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1089 mmu_page_remove_parent_pte(child, spte); 1119 mmu_page_remove_parent_pte(child, spte);
1090 } 1120 }
1091 } 1121 }
1092 set_shadow_pte(spte, 0); 1122 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
1093 kvm_flush_remote_tlbs(vcpu->kvm); 1123 kvm_flush_remote_tlbs(vcpu->kvm);
1094} 1124}
1095 1125
1096static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 1126static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1097 struct kvm_mmu_page *page, 1127 struct kvm_mmu_page *page,
1098 u64 *spte, 1128 u64 *spte,
1099 const void *new, int bytes) 1129 const void *new, int bytes,
1130 int offset_in_pte)
1100{ 1131{
1101 if (page->role.level != PT_PAGE_TABLE_LEVEL) 1132 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1102 return; 1133 return;
1103 1134
1104 if (page->role.glevels == PT32_ROOT_LEVEL) 1135 if (page->role.glevels == PT32_ROOT_LEVEL)
1105 paging32_update_pte(vcpu, page, spte, new, bytes); 1136 paging32_update_pte(vcpu, page, spte, new, bytes,
1137 offset_in_pte);
1106 else 1138 else
1107 paging64_update_pte(vcpu, page, spte, new, bytes); 1139 paging64_update_pte(vcpu, page, spte, new, bytes,
1140 offset_in_pte);
1108} 1141}
1109 1142
1110void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 1143void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1126,6 +1159,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1126 int npte; 1159 int npte;
1127 1160
1128 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); 1161 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1162 kvm_mmu_audit(vcpu, "pre pte write");
1129 if (gfn == vcpu->last_pt_write_gfn) { 1163 if (gfn == vcpu->last_pt_write_gfn) {
1130 ++vcpu->last_pt_write_count; 1164 ++vcpu->last_pt_write_count;
1131 if (vcpu->last_pt_write_count >= 3) 1165 if (vcpu->last_pt_write_count >= 3)
@@ -1181,10 +1215,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1181 spte = &page->spt[page_offset / sizeof(*spte)]; 1215 spte = &page->spt[page_offset / sizeof(*spte)];
1182 while (npte--) { 1216 while (npte--) {
1183 mmu_pte_write_zap_pte(vcpu, page, spte); 1217 mmu_pte_write_zap_pte(vcpu, page, spte);
1184 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes); 1218 mmu_pte_write_new_pte(vcpu, page, spte, new, bytes,
1219 page_offset & (pte_size - 1));
1185 ++spte; 1220 ++spte;
1186 } 1221 }
1187 } 1222 }
1223 kvm_mmu_audit(vcpu, "post pte write");
1188} 1224}
1189 1225
1190int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 1226int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -1359,22 +1395,33 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1359 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { 1395 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1360 u64 ent = pt[i]; 1396 u64 ent = pt[i];
1361 1397
1362 if (!(ent & PT_PRESENT_MASK)) 1398 if (ent == shadow_trap_nonpresent_pte)
1363 continue; 1399 continue;
1364 1400
1365 va = canonicalize(va); 1401 va = canonicalize(va);
1366 if (level > 1) 1402 if (level > 1) {
1403 if (ent == shadow_notrap_nonpresent_pte)
1404 printk(KERN_ERR "audit: (%s) nontrapping pte"
1405 " in nonleaf level: levels %d gva %lx"
1406 " level %d pte %llx\n", audit_msg,
1407 vcpu->mmu.root_level, va, level, ent);
1408
1367 audit_mappings_page(vcpu, ent, va, level - 1); 1409 audit_mappings_page(vcpu, ent, va, level - 1);
1368 else { 1410 } else {
1369 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); 1411 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1370 hpa_t hpa = gpa_to_hpa(vcpu, gpa); 1412 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1371 1413
1372 if ((ent & PT_PRESENT_MASK) 1414 if (is_shadow_present_pte(ent)
1373 && (ent & PT64_BASE_ADDR_MASK) != hpa) 1415 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1374 printk(KERN_ERR "audit error: (%s) levels %d" 1416 printk(KERN_ERR "xx audit error: (%s) levels %d"
1375 " gva %lx gpa %llx hpa %llx ent %llx\n", 1417 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
1376 audit_msg, vcpu->mmu.root_level, 1418 audit_msg, vcpu->mmu.root_level,
1377 va, gpa, hpa, ent); 1419 va, gpa, hpa, ent, is_shadow_present_pte(ent));
1420 else if (ent == shadow_notrap_nonpresent_pte
1421 && !is_error_hpa(hpa))
1422 printk(KERN_ERR "audit: (%s) notrap shadow,"
1423 " valid guest gva %lx\n", audit_msg, va);
1424
1378 } 1425 }
1379 } 1426 }
1380} 1427}
diff --git a/drivers/kvm/paging_tmpl.h b/drivers/kvm/paging_tmpl.h
index 6b094b44f8fb..99ac9b15f773 100644
--- a/drivers/kvm/paging_tmpl.h
+++ b/drivers/kvm/paging_tmpl.h
@@ -31,6 +31,7 @@
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
34 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
35 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
36 #else 37 #else
@@ -45,6 +46,7 @@
45 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 46 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
46 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 47 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
47 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 48 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
49 #define PT_LEVEL_BITS PT32_LEVEL_BITS
48 #define PT_MAX_FULL_LEVELS 2 50 #define PT_MAX_FULL_LEVELS 2
49#else 51#else
50 #error Invalid PTTYPE value 52 #error Invalid PTTYPE value
@@ -211,12 +213,12 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
211{ 213{
212 hpa_t paddr; 214 hpa_t paddr;
213 int dirty = gpte & PT_DIRTY_MASK; 215 int dirty = gpte & PT_DIRTY_MASK;
214 u64 spte = *shadow_pte; 216 u64 spte;
215 int was_rmapped = is_rmap_pte(spte); 217 int was_rmapped = is_rmap_pte(*shadow_pte);
216 218
217 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d" 219 pgprintk("%s: spte %llx gpte %llx access %llx write_fault %d"
218 " user_fault %d gfn %lx\n", 220 " user_fault %d gfn %lx\n",
219 __FUNCTION__, spte, (u64)gpte, access_bits, 221 __FUNCTION__, *shadow_pte, (u64)gpte, access_bits,
220 write_fault, user_fault, gfn); 222 write_fault, user_fault, gfn);
221 223
222 if (write_fault && !dirty) { 224 if (write_fault && !dirty) {
@@ -236,7 +238,7 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
236 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); 238 FNAME(mark_pagetable_dirty)(vcpu->kvm, walker);
237 } 239 }
238 240
239 spte |= PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK; 241 spte = PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK;
240 spte |= gpte & PT64_NX_MASK; 242 spte |= gpte & PT64_NX_MASK;
241 if (!dirty) 243 if (!dirty)
242 access_bits &= ~PT_WRITABLE_MASK; 244 access_bits &= ~PT_WRITABLE_MASK;
@@ -248,10 +250,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
248 spte |= PT_USER_MASK; 250 spte |= PT_USER_MASK;
249 251
250 if (is_error_hpa(paddr)) { 252 if (is_error_hpa(paddr)) {
251 spte |= gaddr; 253 set_shadow_pte(shadow_pte,
252 spte |= PT_SHADOW_IO_MARK; 254 shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
253 spte &= ~PT_PRESENT_MASK;
254 set_shadow_pte(shadow_pte, spte);
255 return; 255 return;
256 } 256 }
257 257
@@ -286,6 +286,7 @@ unshadowed:
286 if (access_bits & PT_WRITABLE_MASK) 286 if (access_bits & PT_WRITABLE_MASK)
287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT); 287 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
288 288
289 pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
289 set_shadow_pte(shadow_pte, spte); 290 set_shadow_pte(shadow_pte, spte);
290 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); 291 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
291 if (!was_rmapped) 292 if (!was_rmapped)
@@ -304,14 +305,18 @@ static void FNAME(set_pte)(struct kvm_vcpu *vcpu, pt_element_t gpte,
304} 305}
305 306
306static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 307static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
307 u64 *spte, const void *pte, int bytes) 308 u64 *spte, const void *pte, int bytes,
309 int offset_in_pte)
308{ 310{
309 pt_element_t gpte; 311 pt_element_t gpte;
310 312
311 if (bytes < sizeof(pt_element_t))
312 return;
313 gpte = *(const pt_element_t *)pte; 313 gpte = *(const pt_element_t *)pte;
314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) 314 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
315 if (!offset_in_pte && !is_present_pte(gpte))
316 set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
317 return;
318 }
319 if (bytes < sizeof(pt_element_t))
315 return; 320 return;
316 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); 321 pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
317 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0, 322 FNAME(set_pte)(vcpu, gpte, spte, PT_USER_MASK | PT_WRITABLE_MASK, 0,
@@ -368,7 +373,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
368 unsigned hugepage_access = 0; 373 unsigned hugepage_access = 0;
369 374
370 shadow_ent = ((u64 *)__va(shadow_addr)) + index; 375 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
371 if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) { 376 if (is_shadow_present_pte(*shadow_ent)) {
372 if (level == PT_PAGE_TABLE_LEVEL) 377 if (level == PT_PAGE_TABLE_LEVEL)
373 break; 378 break;
374 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 379 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
@@ -500,6 +505,26 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
500 return gpa; 505 return gpa;
501} 506}
502 507
508static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
509 struct kvm_mmu_page *sp)
510{
511 int i;
512 pt_element_t *gpt;
513
514 if (sp->role.metaphysical || PTTYPE == 32) {
515 nonpaging_prefetch_page(vcpu, sp);
516 return;
517 }
518
519 gpt = kmap_atomic(gfn_to_page(vcpu->kvm, sp->gfn), KM_USER0);
520 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
521 if (is_present_pte(gpt[i]))
522 sp->spt[i] = shadow_trap_nonpresent_pte;
523 else
524 sp->spt[i] = shadow_notrap_nonpresent_pte;
525 kunmap_atomic(gpt, KM_USER0);
526}
527
503#undef pt_element_t 528#undef pt_element_t
504#undef guest_walker 529#undef guest_walker
505#undef FNAME 530#undef FNAME
@@ -508,4 +533,5 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
508#undef SHADOW_PT_INDEX 533#undef SHADOW_PT_INDEX
509#undef PT_LEVEL_MASK 534#undef PT_LEVEL_MASK
510#undef PT_DIR_BASE_ADDR_MASK 535#undef PT_DIR_BASE_ADDR_MASK
536#undef PT_LEVEL_BITS
511#undef PT_MAX_FULL_LEVELS 537#undef PT_MAX_FULL_LEVELS
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 8eb49e055ec0..27a3318fa6c2 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -26,6 +26,7 @@
26#include <linux/mm.h> 26#include <linux/mm.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/moduleparam.h>
29 30
30#include <asm/io.h> 31#include <asm/io.h>
31#include <asm/desc.h> 32#include <asm/desc.h>
@@ -33,6 +34,9 @@
33MODULE_AUTHOR("Qumranet"); 34MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
35 36
37static int bypass_guest_pf = 1;
38module_param(bypass_guest_pf, bool, 0);
39
36struct vmcs { 40struct vmcs {
37 u32 revision_id; 41 u32 revision_id;
38 u32 abort; 42 u32 abort;
@@ -1535,8 +1539,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1535 } 1539 }
1536 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 1540 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1537 1541
1538 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 1542 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
1539 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 1543 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
1540 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 1544 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
1541 1545
1542 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 1546 vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */
@@ -2582,6 +2586,9 @@ static int __init vmx_init(void)
2582 if (r) 2586 if (r)
2583 goto out1; 2587 goto out1;
2584 2588
2589 if (bypass_guest_pf)
2590 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
2591
2585 return 0; 2592 return 0;
2586 2593
2587out1: 2594out1: