Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  684
1 file changed, 542 insertions(+), 142 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bfe2bd305eb..99c239c5c0ac 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
 module_param(dbg, bool, 0644);
 #endif
 
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
 #ifndef MMU_DEBUG
 #define ASSERT(x) do { } while (0)
 #else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK PT_USER_MASK
 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
-struct kvm_pv_mmu_op_buffer {
-	void *ptr;
-	unsigned len;
-	unsigned processed;
-	char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
 struct kvm_rmap_desc {
 	u64 *shadow_ptes[RMAP_EXT];
 	struct kvm_rmap_desc *more;
 };
 
+struct kvm_shadow_walk {
+	int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+		     u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr;
+	int ret = 0;
 
 	addr = gfn_to_hva(kvm, gfn);
 	if (kvm_is_error_hva(addr))
-		return 0;
+		return ret;
 
+	down_read(&current->mm->mmap_sem);
 	vma = find_vma(current->mm, addr);
 	if (vma && is_vm_hugetlb_page(vma))
-		return 1;
+		ret = 1;
+	up_read(&current->mm->mmap_sem);
 
-	return 0;
+	return ret;
 }
 
 static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 
 	if (write_protected)
 		kvm_flush_remote_tlbs(kvm);
-
-	account_shadowed(kvm, gfn);
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -711,6 +721,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
 	u64 *spte;
 	int young = 0;
 
+	/* always return old for EPT */
+	if (!shadow_accessed_mask)
+		return 0;
+
 	spte = rmap_next(kvm, rmapp, NULL);
 	while (spte) {
 		int _young;
@@ -855,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    mmu_parent_walk_fn fn)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	struct kvm_mmu_page *parent_sp;
+	int i;
+
+	if (!sp->multimapped && sp->parent_pte) {
+		parent_sp = page_header(__pa(sp->parent_pte));
+		fn(vcpu, parent_sp);
+		mmu_parent_walk(vcpu, parent_sp, fn);
+		return;
+	}
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+			fn(vcpu, parent_sp);
+			mmu_parent_walk(vcpu, parent_sp, fn);
+		}
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+	unsigned int index;
+	struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+	index = spte - sp->spt;
+	__set_bit(index, sp->unsync_child_bitmap);
+	sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+	struct kvm_pte_chain *pte_chain;
+	struct hlist_node *node;
+	int i;
+
+	if (!sp->parent_pte)
+		return;
+
+	if (!sp->multimapped) {
+		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+		return;
+	}
+
+	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+			if (!pte_chain->parent_ptes[i])
+				break;
+			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+		}
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	sp->unsync_children = 1;
+	kvm_mmu_update_parents_unsync(sp);
+	return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+					struct kvm_mmu_page *sp)
+{
+	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	kvm_mmu_update_parents_unsync(sp);
+}
+
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp)
 {
@@ -864,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 		sp->spt[i] = shadow_trap_nonpresent_pte;
 }
 
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+			       struct kvm_mmu_page *sp)
+{
+	return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx)		\
+	for (idx = find_first_bit(bitmap, 512);		\
+	     idx < 512;					\
+	     idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_unsync_walk *walker)
+{
+	int i, ret;
+
+	if (!sp->unsync_children)
+		return 0;
+
+	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+		u64 ent = sp->spt[i];
+
+		if (is_shadow_present_pte(ent)) {
+			struct kvm_mmu_page *child;
+			child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+			if (child->unsync_children) {
+				ret = mmu_unsync_walk(child, walker);
+				if (ret)
+					return ret;
+				__clear_bit(i, sp->unsync_child_bitmap);
+			}
+
+			if (child->unsync) {
+				ret = walker->entry(child, walker);
+				__clear_bit(i, sp->unsync_child_bitmap);
+				if (ret)
+					return ret;
+			}
+		}
+	}
+
+	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+		sp->unsync_children = 0;
+
+	return 0;
+}
+
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 {
 	unsigned index;
@@ -884,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	WARN_ON(!sp->unsync);
+	sp->unsync = 0;
+	--kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+		kvm_mmu_zap_page(vcpu->kvm, sp);
+		return 1;
+	}
+
+	kvm_mmu_flush_tlb(vcpu);
+	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	return 0;
+}
+
+struct sync_walker {
+	struct kvm_vcpu *vcpu;
+	struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+						     walker);
+	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+	kvm_sync_page(vcpu, sp);
+	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	struct sync_walker walker = {
+		.walker = { .entry = mmu_sync_fn, },
+		.vcpu = vcpu,
+	};
+
+	while (mmu_unsync_walk(sp, &walker.walker))
+		cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -897,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned quadrant;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
+	struct hlist_node *node, *tmp;
 
 	role.word = 0;
 	role.glevels = vcpu->arch.mmu.root_level;
@@ -913,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		 gfn, role.word);
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && sp->role.word == role.word) {
+	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+		if (sp->gfn == gfn) {
+			if (sp->unsync)
+				if (kvm_sync_page(vcpu, sp))
+					continue;
+
+			if (sp->role.word != role.word)
+				continue;
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+			if (sp->unsync_children) {
+				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+				kvm_mmu_mark_parents_unsync(vcpu, sp);
+			}
 			pgprintk("%s: found\n", __func__);
 			return sp;
 		}
@@ -927,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->gfn = gfn;
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
-	if (!metaphysical)
+	if (!metaphysical) {
 		rmap_write_protect(vcpu->kvm, gfn);
+		account_shadowed(vcpu->kvm, gfn);
+	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 		vcpu->arch.mmu.prefetch_page(vcpu, sp);
 	else
@@ -936,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	return sp;
 }
 
+static int walk_shadow(struct kvm_shadow_walk *walker,
+		       struct kvm_vcpu *vcpu, u64 addr)
+{
+	hpa_t shadow_addr;
+	int level;
+	int r;
+	u64 *sptep;
+	unsigned index;
+
+	shadow_addr = vcpu->arch.mmu.root_hpa;
+	level = vcpu->arch.mmu.shadow_root_level;
+	if (level == PT32E_ROOT_LEVEL) {
+		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+		shadow_addr &= PT64_BASE_ADDR_MASK;
+		--level;
+	}
+
+	while (level >= PT_PAGE_TABLE_LEVEL) {
+		index = SHADOW_PT_INDEX(addr, level);
+		sptep = ((u64 *)__va(shadow_addr)) + index;
+		r = walker->entry(walker, vcpu, addr, sptep, level);
+		if (r)
+			return r;
+		shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+		--level;
+	}
+	return 0;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
@@ -951,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 				rmap_remove(kvm, &pt[i]);
 			pt[i] = shadow_trap_nonpresent_pte;
 		}
-		kvm_flush_remote_tlbs(kvm);
 		return;
 	}
 
@@ -970,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 	}
-	kvm_flush_remote_tlbs(kvm);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -987,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
 			kvm->vcpus[i]->arch.last_pte_updated = NULL;
 }
 
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	++kvm->stat.mmu_shadow_zapped;
 	while (sp->multimapped || sp->parent_pte) {
 		if (!sp->multimapped)
 			parent_pte = sp->parent_pte;
@@ -1006,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 		kvm_mmu_put_page(sp, parent_pte);
 		set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
 	}
+}
+
+struct zap_walker {
+	struct kvm_unsync_walk walker;
+	struct kvm *kvm;
+	int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+						   walker);
+	kvm_mmu_zap_page(zap_walk->kvm, sp);
+	zap_walk->zapped = 1;
+	return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	struct zap_walker walker = {
+		.walker = { .entry = mmu_zap_fn, },
+		.kvm = kvm,
+		.zapped = 0,
+	};
+
+	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+		return 0;
+	mmu_unsync_walk(sp, &walker.walker);
+	return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	int ret;
+	++kvm->stat.mmu_shadow_zapped;
+	ret = mmu_zap_unsync_children(kvm, sp);
 	kvm_mmu_page_unlink_children(kvm, sp);
+	kvm_mmu_unlink_parents(kvm, sp);
+	kvm_flush_remote_tlbs(kvm);
+	if (!sp->role.invalid && !sp->role.metaphysical)
+		unaccount_shadowed(kvm, sp->gfn);
+	if (sp->unsync)
+		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
-		if (!sp->role.metaphysical && !sp->role.invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
-		int invalid = sp->role.invalid;
-		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		sp->role.invalid = 1;
+		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
-		if (!sp->role.metaphysical && !invalid)
-			unaccount_shadowed(kvm, sp->gfn);
 	}
 	kvm_mmu_reset_last_pte_updated(kvm);
+	return ret;
 }
 
 /*
@@ -1073,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 		if (sp->gfn == gfn && !sp->role.metaphysical) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
-			kvm_mmu_zap_page(kvm, sp);
 			r = 1;
+			if (kvm_mmu_zap_page(kvm, sp))
+				n = bucket->first;
 		}
 	return r;
 }
@@ -1097,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, &sp->slot_bitmap);
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+	int i;
+	u64 *pt = sp->spt;
+
+	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+		return;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+		if (pt[i] == shadow_notrap_nonpresent_pte)
+			set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+	}
+}
+
 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct page *page;
@@ -1106,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	if (gpa == UNMAPPED_GVA)
 		return NULL;
 
-	down_read(&current->mm->mmap_sem);
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
 
 	return page;
 }
 
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
-			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	u64 spte;
-	int was_rmapped = 0;
-	int was_writeble = is_writeble_pte(*shadow_pte);
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
 
-	pgprintk("%s: spte %llx access %x write_fault %d"
-		 " user_fault %d gfn %lx\n",
-		 __func__, *shadow_pte, pt_access,
-		 write_fault, user_fault, gfn);
+	index = kvm_page_table_hashfn(sp->gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	/* don't unsync if pagetable is shadowed with multiple roles */
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != sp->gfn || s->role.metaphysical)
+			continue;
+		if (s->role.word != sp->role.word)
+			return 1;
+	}
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
+	++vcpu->kvm->stat.mmu_unsync;
+	sp->unsync = 1;
+	mmu_convert_notrap(sp);
+	return 0;
+}
 
-	if (is_rmap_pte(*shadow_pte)) {
-		/*
-		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
-		 * the parent of the now unreachable PTE.
-		 */
-		if (largepage && !is_large_pte(*shadow_pte)) {
-			struct kvm_mmu_page *child;
-			u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+				  bool can_unsync)
+{
+	struct kvm_mmu_page *shadow;
 
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, shadow_pte);
-		} else if (pfn != spte_to_pfn(*shadow_pte)) {
-			pgprintk("hfn old %lx new %lx\n",
-				 spte_to_pfn(*shadow_pte), pfn);
-			rmap_remove(vcpu->kvm, shadow_pte);
-		} else {
-			if (largepage)
-				was_rmapped = is_large_pte(*shadow_pte);
-			else
-				was_rmapped = 1;
-		}
+	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+	if (shadow) {
+		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+		if (shadow->unsync)
+			return 0;
+		if (can_unsync && oos_shadow)
+			return kvm_unsync_page(vcpu, shadow);
+		return 1;
 	}
+	return 0;
+}
 
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+		    unsigned pte_access, int user_fault,
+		    int write_fault, int dirty, int largepage,
+		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    bool can_unsync)
+{
+	u64 spte;
+	int ret = 0;
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1158,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	 */
 	spte = shadow_base_present_pte | shadow_dirty_mask;
 	if (!speculative)
-		pte_access |= PT_ACCESSED_MASK;
+		spte |= shadow_accessed_mask;
 	if (!dirty)
 		pte_access &= ~ACC_WRITE_MASK;
 	if (pte_access & ACC_EXEC_MASK)
@@ -1174,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 	if ((pte_access & ACC_WRITE_MASK)
 	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
-		struct kvm_mmu_page *shadow;
+
+		if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+			ret = 1;
+			spte = shadow_trap_nonpresent_pte;
+			goto set_pte;
+		}
 
 		spte |= PT_WRITABLE_MASK;
 
-		shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-		if (shadow ||
-		   (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);
+			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writeble_pte(spte)) {
+			if (is_writeble_pte(spte))
 				spte &= ~PT_WRITABLE_MASK;
-				kvm_x86_ops->tlb_flush(vcpu);
-			}
-			if (write_fault)
-				*ptwrite = 1;
 		}
 	}
 
 	if (pte_access & ACC_WRITE_MASK)
 		mark_page_dirty(vcpu->kvm, gfn);
 
-	pgprintk("%s: setting spte %llx\n", __func__, spte);
-	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
-		 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
-		 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
 	set_shadow_pte(shadow_pte, spte);
-	if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
-	    && (spte & PT_PRESENT_MASK))
+	return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+			 unsigned pt_access, unsigned pte_access,
+			 int user_fault, int write_fault, int dirty,
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
+{
+	int was_rmapped = 0;
+	int was_writeble = is_writeble_pte(*shadow_pte);
+
+	pgprintk("%s: spte %llx access %x write_fault %d"
+		 " user_fault %d gfn %lx\n",
+		 __func__, *shadow_pte, pt_access,
+		 write_fault, user_fault, gfn);
+
+	if (is_rmap_pte(*shadow_pte)) {
+		/*
+		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+		 * the parent of the now unreachable PTE.
+		 */
+		if (largepage && !is_large_pte(*shadow_pte)) {
+			struct kvm_mmu_page *child;
+			u64 pte = *shadow_pte;
+
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			mmu_page_remove_parent_pte(child, shadow_pte);
+		} else if (pfn != spte_to_pfn(*shadow_pte)) {
+			pgprintk("hfn old %lx new %lx\n",
+				 spte_to_pfn(*shadow_pte), pfn);
+			rmap_remove(vcpu->kvm, shadow_pte);
+		} else {
+			if (largepage)
+				was_rmapped = is_large_pte(*shadow_pte);
+			else
+				was_rmapped = 1;
+		}
+	}
+	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+		      dirty, largepage, gfn, pfn, speculative, true)) {
+		if (write_fault)
+			*ptwrite = 1;
+		kvm_x86_ops->tlb_flush(vcpu);
+	}
+
+	pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+	pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+		 is_large_pte(*shadow_pte)? "2MB" : "4kB",
+		 is_present_pte(*shadow_pte)?"RW":"R", gfn,
+		 *shadow_pte, shadow_pte);
+	if (!was_rmapped && is_large_pte(*shadow_pte))
 		++vcpu->kvm->stat.lpages;
 
 	page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1226,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
 }
 
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
-			int largepage, gfn_t gfn, pfn_t pfn,
-			int level)
-{
-	hpa_t table_addr = vcpu->arch.mmu.root_hpa;
-	int pt_write = 0;
-
-	for (; ; level--) {
-		u32 index = PT64_INDEX(v, level);
-		u64 *table;
-
-		ASSERT(VALID_PAGE(table_addr));
-		table = __va(table_addr);
+struct direct_shadow_walk {
+	struct kvm_shadow_walk walker;
+	pfn_t pfn;
+	int write;
+	int largepage;
+	int pt_write;
+};
 
-		if (level == 1) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 0, gfn, pfn, false);
-			return pt_write;
-		}
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+			    struct kvm_vcpu *vcpu,
+			    u64 addr, u64 *sptep, int level)
+{
+	struct direct_shadow_walk *walk =
+		container_of(_walk, struct direct_shadow_walk, walker);
+	struct kvm_mmu_page *sp;
+	gfn_t pseudo_gfn;
+	gfn_t gfn = addr >> PAGE_SHIFT;
+
+	if (level == PT_PAGE_TABLE_LEVEL
+	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+			     0, walk->write, 1, &walk->pt_write,
+			     walk->largepage, gfn, walk->pfn, false);
+		++vcpu->stat.pf_fixed;
+		return 1;
+	}
 
-		if (largepage && level == 2) {
-			mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
-				     0, write, 1, &pt_write, 1, gfn, pfn, false);
-			return pt_write;
+	if (*sptep == shadow_trap_nonpresent_pte) {
+		pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+		sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+				      1, ACC_ALL, sptep);
+		if (!sp) {
+			pgprintk("nonpaging_map: ENOMEM\n");
+			kvm_release_pfn_clean(walk->pfn);
+			return -ENOMEM;
 		}
 
-		if (table[index] == shadow_trap_nonpresent_pte) {
-			struct kvm_mmu_page *new_table;
-			gfn_t pseudo_gfn;
-
-			pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
-				>> PAGE_SHIFT;
-			new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
-						     v, level - 1,
-						     1, ACC_ALL, &table[index]);
-			if (!new_table) {
-				pgprintk("nonpaging_map: ENOMEM\n");
-				kvm_release_pfn_clean(pfn);
-				return -ENOMEM;
-			}
-
-			set_shadow_pte(&table[index],
-				       __pa(new_table->spt)
-				       | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				       | shadow_user_mask | shadow_x_mask);
-		}
-		table_addr = table[index] & PT64_BASE_ADDR_MASK;
+		set_shadow_pte(sptep,
+			       __pa(sp->spt)
+			       | PT_PRESENT_MASK | PT_WRITABLE_MASK
+			       | shadow_user_mask | shadow_x_mask);
 	}
+	return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+			int largepage, gfn_t gfn, pfn_t pfn)
+{
+	int r;
+	struct direct_shadow_walk walker = {
+		.walker = { .entry = direct_map_entry, },
+		.pfn = pfn,
+		.largepage = largepage,
+		.write = write,
+		.pt_write = 0,
+	};
+
+	r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+	if (r < 0)
+		return r;
+	return walker.pt_write;
 }
 
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1283,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn_t pfn;
 	unsigned long mmu_seq;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	/* mmio */
 	if (is_error_pfn(pfn)) {
@@ -1304,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
-	r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
-			 PT32E_ROOT_LEVEL);
+	r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 
@@ -1401,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
 }
 
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+
+	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		return;
+	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+		hpa_t root = vcpu->arch.mmu.root_hpa;
+		sp = page_header(root);
+		mmu_sync_children(vcpu, sp);
+		return;
+	}
+	for (i = 0; i < 4; ++i) {
+		hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+		if (root) {
+			root &= PT64_BASE_ADDR_MASK;
+			sp = page_header(root);
+			mmu_sync_children(vcpu, sp);
+		}
+	}
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_roots(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
@@ -1442,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	if (r)
 		return r;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		largepage = 1;
 	}
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return 1;
@@ -1460,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 		goto out_unlock;
 	kvm_mmu_free_some_pages(vcpu);
 	r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
-			 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+			 largepage, gfn, pfn);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
 	return r;
@@ -1485,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1532,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
 	context->prefetch_page = paging64_prefetch_page;
+	context->sync_page = paging64_sync_page;
+	context->invlpg = paging64_invlpg;
 	context->free = paging_free;
 	context->root_level = level;
 	context->shadow_root_level = level;
@@ -1553,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
 	context->prefetch_page = paging32_prefetch_page;
+	context->sync_page = paging32_sync_page;
+	context->invlpg = paging32_invlpg;
 	context->root_level = PT32_ROOT_LEVEL;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
 	context->root_hpa = INVALID_PAGE;
@@ -1572,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
 	context->prefetch_page = nonpaging_prefetch_page;
+	context->sync_page = nonpaging_sync_page;
+	context->invlpg = nonpaging_invlpg;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 	context->root_hpa = INVALID_PAGE;
 
@@ -1643,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
 	mmu_alloc_roots(vcpu);
+	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
@@ -1763,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
-	down_read(&current->mm->mmap_sem);
 	if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
 		gfn &= ~(KVM_PAGES_PER_HPAGE-1);
 		vcpu->arch.update_pte.largepage = 1;
 	}
 	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
-	/* implicit mb(), we'll read before PT lock is unlocked */
+	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	up_read(&current->mm->mmap_sem);
 
 	if (is_error_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
@@ -1833,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.metaphysical)
+		if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
 			continue;
 		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1851,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+			if (kvm_mmu_zap_page(vcpu->kvm, sp))
+				n = bucket->first;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -1965,6 +2350,16 @@ out:
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	vcpu->arch.mmu.invlpg(vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_flush_tlb(vcpu);
+	++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
 void kvm_enable_tdp(void)
 {
 	tdp_enabled = true;
@@ -2051,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
 
+	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
 		u64 *pt;
@@ -2064,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			if (pt[i] & PT_WRITABLE_MASK)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
+	kvm_flush_remote_tlbs(kvm);
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2072,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		kvm_mmu_zap_page(kvm, sp);
+		if (kvm_mmu_zap_page(kvm, sp))
+			node = container_of(kvm->arch.active_mmu_pages.next,
+					    struct kvm_mmu_page, link);
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
@@ -2287,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 			  gpa_t addr, unsigned long *ret)
 {
 	int r;
-	struct kvm_pv_mmu_op_buffer buffer;
+	struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
 
-	buffer.ptr = buffer.buf;
-	buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
-	buffer.processed = 0;
+	buffer->ptr = buffer->buf;
+	buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+	buffer->processed = 0;
 
-	r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+	r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
 	if (r)
 		goto out;
 
-	while (buffer.len) {
-		r = kvm_pv_mmu_op_one(vcpu, &buffer);
+	while (buffer->len) {
+		r = kvm_pv_mmu_op_one(vcpu, buffer);
 		if (r < 0)
 			goto out;
 		if (r == 0)
@@ -2307,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 
 	r = 1;
 out:
-	*ret = buffer.processed;
+	*ret = buffer->processed;
 	return r;
 }
 