aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c225
1 files changed, 114 insertions, 111 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa..81563e76e28 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
148 148
149#include <trace/events/kvm.h> 149#include <trace/events/kvm.h>
150 150
151#undef TRACE_INCLUDE_FILE
152#define CREATE_TRACE_POINTS 151#define CREATE_TRACE_POINTS
153#include "mmutrace.h" 152#include "mmutrace.h"
154 153
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
174 shadow_walk_okay(&(_walker)); \ 173 shadow_walk_okay(&(_walker)); \
175 shadow_walk_next(&(_walker))) 174 shadow_walk_next(&(_walker)))
176 175
177 176typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
178struct kvm_unsync_walk {
179 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
180};
181
182typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
183 177
184static struct kmem_cache *pte_chain_cache; 178static struct kmem_cache *pte_chain_cache;
185static struct kmem_cache *rmap_desc_cache; 179static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
223} 217}
224EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 218EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
225 219
226static int is_write_protection(struct kvm_vcpu *vcpu) 220static bool is_write_protection(struct kvm_vcpu *vcpu)
227{ 221{
228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP); 222 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
229} 223}
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
327 page = alloc_page(GFP_KERNEL); 321 page = alloc_page(GFP_KERNEL);
328 if (!page) 322 if (!page)
329 return -ENOMEM; 323 return -ENOMEM;
330 set_page_private(page, 0);
331 cache->objects[cache->nobjs++] = page_address(page); 324 cache->objects[cache->nobjs++] = page_address(page);
332 } 325 }
333 return 0; 326 return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
438 int i; 431 int i;
439 432
440 gfn = unalias_gfn(kvm, gfn); 433 gfn = unalias_gfn(kvm, gfn);
434 slot = gfn_to_memslot_unaliased(kvm, gfn);
441 for (i = PT_DIRECTORY_LEVEL; 435 for (i = PT_DIRECTORY_LEVEL;
442 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 436 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
443 slot = gfn_to_memslot_unaliased(kvm, gfn);
444 write_count = slot_largepage_idx(gfn, slot, i); 437 write_count = slot_largepage_idx(gfn, slot, i);
445 *write_count -= 1; 438 *write_count -= 1;
446 WARN_ON(*write_count < 0); 439 WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
654static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 647static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
655{ 648{
656 struct kvm_rmap_desc *desc; 649 struct kvm_rmap_desc *desc;
657 struct kvm_rmap_desc *prev_desc;
658 u64 *prev_spte; 650 u64 *prev_spte;
659 int i; 651 int i;
660 652
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
666 return NULL; 658 return NULL;
667 } 659 }
668 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
669 prev_desc = NULL;
670 prev_spte = NULL; 661 prev_spte = NULL;
671 while (desc) { 662 while (desc) {
672 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 663 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
794 int retval = 0; 785 int retval = 0;
795 struct kvm_memslots *slots; 786 struct kvm_memslots *slots;
796 787
797 slots = rcu_dereference(kvm->memslots); 788 slots = kvm_memslots(kvm);
798 789
799 for (i = 0; i < slots->nmemslots; i++) { 790 for (i = 0; i < slots->nmemslots; i++) {
800 struct kvm_memory_slot *memslot = &slots->memslots[i]; 791 struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
925 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
926 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 917 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
927 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 918 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
928 INIT_LIST_HEAD(&sp->oos_link);
929 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 919 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
930 sp->multimapped = 0; 920 sp->multimapped = 0;
931 sp->parent_pte = parent_pte; 921 sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1009} 999}
1010 1000
1011 1001
1012static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1002static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
1013 mmu_parent_walk_fn fn)
1014{ 1003{
1015 struct kvm_pte_chain *pte_chain; 1004 struct kvm_pte_chain *pte_chain;
1016 struct hlist_node *node; 1005 struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1019 1008
1020 if (!sp->multimapped && sp->parent_pte) { 1009 if (!sp->multimapped && sp->parent_pte) {
1021 parent_sp = page_header(__pa(sp->parent_pte)); 1010 parent_sp = page_header(__pa(sp->parent_pte));
1022 fn(vcpu, parent_sp); 1011 fn(parent_sp);
1023 mmu_parent_walk(vcpu, parent_sp, fn); 1012 mmu_parent_walk(parent_sp, fn);
1024 return; 1013 return;
1025 } 1014 }
1026 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1028 if (!pte_chain->parent_ptes[i]) 1017 if (!pte_chain->parent_ptes[i])
1029 break; 1018 break;
1030 parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1019 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
1031 fn(vcpu, parent_sp); 1020 fn(parent_sp);
1032 mmu_parent_walk(vcpu, parent_sp, fn); 1021 mmu_parent_walk(parent_sp, fn);
1033 } 1022 }
1034} 1023}
1035 1024
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
1066 } 1055 }
1067} 1056}
1068 1057
1069static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1058static int unsync_walk_fn(struct kvm_mmu_page *sp)
1070{ 1059{
1071 kvm_mmu_update_parents_unsync(sp); 1060 kvm_mmu_update_parents_unsync(sp);
1072 return 1; 1061 return 1;
1073} 1062}
1074 1063
1075static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, 1064static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1076 struct kvm_mmu_page *sp)
1077{ 1065{
1078 mmu_parent_walk(vcpu, sp, unsync_walk_fn); 1066 mmu_parent_walk(sp, unsync_walk_fn);
1079 kvm_mmu_update_parents_unsync(sp); 1067 kvm_mmu_update_parents_unsync(sp);
1080} 1068}
1081 1069
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
1201static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1189static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1202{ 1190{
1203 WARN_ON(!sp->unsync); 1191 WARN_ON(!sp->unsync);
1192 trace_kvm_mmu_sync_page(sp);
1204 sp->unsync = 0; 1193 sp->unsync = 0;
1205 --kvm->stat.mmu_unsync; 1194 --kvm->stat.mmu_unsync;
1206} 1195}
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1209 1198
1210static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1199static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1211{ 1200{
1212 if (sp->role.glevels != vcpu->arch.mmu.root_level) { 1201 if (sp->role.cr4_pae != !!is_pae(vcpu)) {
1213 kvm_mmu_zap_page(vcpu->kvm, sp); 1202 kvm_mmu_zap_page(vcpu->kvm, sp);
1214 return 1; 1203 return 1;
1215 } 1204 }
1216 1205
1217 trace_kvm_mmu_sync_page(sp);
1218 if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1206 if (rmap_write_protect(vcpu->kvm, sp->gfn))
1219 kvm_flush_remote_tlbs(vcpu->kvm); 1207 kvm_flush_remote_tlbs(vcpu->kvm);
1220 kvm_unlink_unsync_page(vcpu->kvm, sp); 1208 kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1331 role = vcpu->arch.mmu.base_role; 1319 role = vcpu->arch.mmu.base_role;
1332 role.level = level; 1320 role.level = level;
1333 role.direct = direct; 1321 role.direct = direct;
1322 if (role.direct)
1323 role.cr4_pae = 0;
1334 role.access = access; 1324 role.access = access;
1335 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1336 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1351 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1352 if (sp->unsync_children) { 1342 if (sp->unsync_children) {
1353 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1343 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1354 kvm_mmu_mark_parents_unsync(vcpu, sp); 1344 kvm_mmu_mark_parents_unsync(sp);
1355 } 1345 }
1356 trace_kvm_mmu_get_page(sp, false); 1346 trace_kvm_mmu_get_page(sp, false);
1357 return sp; 1347 return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1573 r = 0; 1563 r = 0;
1574 index = kvm_page_table_hashfn(gfn); 1564 index = kvm_page_table_hashfn(gfn);
1575 bucket = &kvm->arch.mmu_page_hash[index]; 1565 bucket = &kvm->arch.mmu_page_hash[index];
1566restart:
1576 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1567 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
1577 if (sp->gfn == gfn && !sp->role.direct) { 1568 if (sp->gfn == gfn && !sp->role.direct) {
1578 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1569 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1579 sp->role.word); 1570 sp->role.word);
1580 r = 1; 1571 r = 1;
1581 if (kvm_mmu_zap_page(kvm, sp)) 1572 if (kvm_mmu_zap_page(kvm, sp))
1582 n = bucket->first; 1573 goto restart;
1583 } 1574 }
1584 return r; 1575 return r;
1585} 1576}
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1593 1584
1594 index = kvm_page_table_hashfn(gfn); 1585 index = kvm_page_table_hashfn(gfn);
1595 bucket = &kvm->arch.mmu_page_hash[index]; 1586 bucket = &kvm->arch.mmu_page_hash[index];
1587restart:
1596 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1588 hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
1597 if (sp->gfn == gfn && !sp->role.direct 1589 if (sp->gfn == gfn && !sp->role.direct
1598 && !sp->role.invalid) { 1590 && !sp->role.invalid) {
1599 pgprintk("%s: zap %lx %x\n", 1591 pgprintk("%s: zap %lx %x\n",
1600 __func__, gfn, sp->role.word); 1592 __func__, gfn, sp->role.word);
1601 if (kvm_mmu_zap_page(kvm, sp)) 1593 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first; 1594 goto restart;
1603 } 1595 }
1604 } 1596 }
1605} 1597}
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1626 } 1618 }
1627} 1619}
1628 1620
1629struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1630{
1631 struct page *page;
1632
1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1634
1635 if (gpa == UNMAPPED_GVA)
1636 return NULL;
1637
1638 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1639
1640 return page;
1641}
1642
1643/* 1621/*
1644 * The function is based on mtrr_type_lookup() in 1622 * The function is based on mtrr_type_lookup() in
1645 * arch/x86/kernel/cpu/mtrr/generic.c 1623 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1752 struct kvm_mmu_page *s; 1730 struct kvm_mmu_page *s;
1753 struct hlist_node *node, *n; 1731 struct hlist_node *node, *n;
1754 1732
1755 trace_kvm_mmu_unsync_page(sp);
1756 index = kvm_page_table_hashfn(sp->gfn); 1733 index = kvm_page_table_hashfn(sp->gfn);
1757 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1734 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1758 /* don't unsync if pagetable is shadowed with multiple roles */ 1735 /* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1762 if (s->role.word != sp->role.word) 1739 if (s->role.word != sp->role.word)
1763 return 1; 1740 return 1;
1764 } 1741 }
1742 trace_kvm_mmu_unsync_page(sp);
1765 ++vcpu->kvm->stat.mmu_unsync; 1743 ++vcpu->kvm->stat.mmu_unsync;
1766 sp->unsync = 1; 1744 sp->unsync = 1;
1767 1745
1768 kvm_mmu_mark_parents_unsync(vcpu, sp); 1746 kvm_mmu_mark_parents_unsync(sp);
1769 1747
1770 mmu_convert_notrap(sp); 1748 mmu_convert_notrap(sp);
1771 return 0; 1749 return 0;
@@ -2081,21 +2059,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2081 hpa_t root = vcpu->arch.mmu.root_hpa; 2059 hpa_t root = vcpu->arch.mmu.root_hpa;
2082 2060
2083 ASSERT(!VALID_PAGE(root)); 2061 ASSERT(!VALID_PAGE(root));
2084 if (tdp_enabled)
2085 direct = 1;
2086 if (mmu_check_root(vcpu, root_gfn)) 2062 if (mmu_check_root(vcpu, root_gfn))
2087 return 1; 2063 return 1;
2064 if (tdp_enabled) {
2065 direct = 1;
2066 root_gfn = 0;
2067 }
2068 spin_lock(&vcpu->kvm->mmu_lock);
2088 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2069 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
2089 PT64_ROOT_LEVEL, direct, 2070 PT64_ROOT_LEVEL, direct,
2090 ACC_ALL, NULL); 2071 ACC_ALL, NULL);
2091 root = __pa(sp->spt); 2072 root = __pa(sp->spt);
2092 ++sp->root_count; 2073 ++sp->root_count;
2074 spin_unlock(&vcpu->kvm->mmu_lock);
2093 vcpu->arch.mmu.root_hpa = root; 2075 vcpu->arch.mmu.root_hpa = root;
2094 return 0; 2076 return 0;
2095 } 2077 }
2096 direct = !is_paging(vcpu); 2078 direct = !is_paging(vcpu);
2097 if (tdp_enabled)
2098 direct = 1;
2099 for (i = 0; i < 4; ++i) { 2079 for (i = 0; i < 4; ++i) {
2100 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2080 hpa_t root = vcpu->arch.mmu.pae_root[i];
2101 2081
@@ -2111,11 +2091,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2111 root_gfn = 0; 2091 root_gfn = 0;
2112 if (mmu_check_root(vcpu, root_gfn)) 2092 if (mmu_check_root(vcpu, root_gfn))
2113 return 1; 2093 return 1;
2094 if (tdp_enabled) {
2095 direct = 1;
2096 root_gfn = i << 30;
2097 }
2098 spin_lock(&vcpu->kvm->mmu_lock);
2114 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2099 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2115 PT32_ROOT_LEVEL, direct, 2100 PT32_ROOT_LEVEL, direct,
2116 ACC_ALL, NULL); 2101 ACC_ALL, NULL);
2117 root = __pa(sp->spt); 2102 root = __pa(sp->spt);
2118 ++sp->root_count; 2103 ++sp->root_count;
2104 spin_unlock(&vcpu->kvm->mmu_lock);
2105
2119 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2106 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2120 } 2107 }
2121 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2108 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2286,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2299 /* no rsvd bits for 2 level 4K page table entries */ 2286 /* no rsvd bits for 2 level 4K page table entries */
2300 context->rsvd_bits_mask[0][1] = 0; 2287 context->rsvd_bits_mask[0][1] = 0;
2301 context->rsvd_bits_mask[0][0] = 0; 2288 context->rsvd_bits_mask[0][0] = 0;
2289 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2290
2291 if (!is_pse(vcpu)) {
2292 context->rsvd_bits_mask[1][1] = 0;
2293 break;
2294 }
2295
2302 if (is_cpuid_PSE36()) 2296 if (is_cpuid_PSE36())
2303 /* 36bits PSE 4MB page */ 2297 /* 36bits PSE 4MB page */
2304 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 2298 context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
2305 else 2299 else
2306 /* 32 bits PSE 4MB page */ 2300 /* 32 bits PSE 4MB page */
2307 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 2301 context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
2308 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
2309 break; 2302 break;
2310 case PT32E_ROOT_LEVEL: 2303 case PT32E_ROOT_LEVEL:
2311 context->rsvd_bits_mask[0][2] = 2304 context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2311,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2318 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2311 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2319 rsvd_bits(maxphyaddr, 62) | 2312 rsvd_bits(maxphyaddr, 62) |
2320 rsvd_bits(13, 20); /* large page */ 2313 rsvd_bits(13, 20); /* large page */
2321 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2314 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2322 break; 2315 break;
2323 case PT64_ROOT_LEVEL: 2316 case PT64_ROOT_LEVEL:
2324 context->rsvd_bits_mask[0][3] = exb_bit_rsvd | 2317 context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2329,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2336 context->rsvd_bits_mask[1][1] = exb_bit_rsvd | 2329 context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
2337 rsvd_bits(maxphyaddr, 51) | 2330 rsvd_bits(maxphyaddr, 51) |
2338 rsvd_bits(13, 20); /* large page */ 2331 rsvd_bits(13, 20); /* large page */
2339 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; 2332 context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
2340 break; 2333 break;
2341 } 2334 }
2342} 2335}
@@ -2438,7 +2431,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2438 else 2431 else
2439 r = paging32_init_context(vcpu); 2432 r = paging32_init_context(vcpu);
2440 2433
2441 vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; 2434 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2435 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2442 2436
2443 return r; 2437 return r;
2444} 2438}
@@ -2478,7 +2472,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2478 goto out; 2472 goto out;
2479 spin_lock(&vcpu->kvm->mmu_lock); 2473 spin_lock(&vcpu->kvm->mmu_lock);
2480 kvm_mmu_free_some_pages(vcpu); 2474 kvm_mmu_free_some_pages(vcpu);
2475 spin_unlock(&vcpu->kvm->mmu_lock);
2481 r = mmu_alloc_roots(vcpu); 2476 r = mmu_alloc_roots(vcpu);
2477 spin_lock(&vcpu->kvm->mmu_lock);
2482 mmu_sync_roots(vcpu); 2478 mmu_sync_roots(vcpu);
2483 spin_unlock(&vcpu->kvm->mmu_lock); 2479 spin_unlock(&vcpu->kvm->mmu_lock);
2484 if (r) 2480 if (r)
@@ -2527,7 +2523,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2527 } 2523 }
2528 2524
2529 ++vcpu->kvm->stat.mmu_pte_updated; 2525 ++vcpu->kvm->stat.mmu_pte_updated;
2530 if (sp->role.glevels == PT32_ROOT_LEVEL) 2526 if (!sp->role.cr4_pae)
2531 paging32_update_pte(vcpu, sp, spte, new); 2527 paging32_update_pte(vcpu, sp, spte, new);
2532 else 2528 else
2533 paging64_update_pte(vcpu, sp, spte, new); 2529 paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2558,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2562} 2558}
2563 2559
2564static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2560static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2565 const u8 *new, int bytes) 2561 u64 gpte)
2566{ 2562{
2567 gfn_t gfn; 2563 gfn_t gfn;
2568 int r;
2569 u64 gpte = 0;
2570 pfn_t pfn; 2564 pfn_t pfn;
2571 2565
2572 if (bytes != 4 && bytes != 8)
2573 return;
2574
2575 /*
2576 * Assume that the pte write on a page table of the same type
2577 * as the current vcpu paging mode. This is nearly always true
2578 * (might be false while changing modes). Note it is verified later
2579 * by update_pte().
2580 */
2581 if (is_pae(vcpu)) {
2582 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2583 if ((bytes == 4) && (gpa % 4 == 0)) {
2584 r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
2585 if (r)
2586 return;
2587 memcpy((void *)&gpte + (gpa % 8), new, 4);
2588 } else if ((bytes == 8) && (gpa % 8 == 0)) {
2589 memcpy((void *)&gpte, new, 8);
2590 }
2591 } else {
2592 if ((bytes == 4) && (gpa % 4 == 0))
2593 memcpy((void *)&gpte, new, 4);
2594 }
2595 if (!is_present_gpte(gpte)) 2566 if (!is_present_gpte(gpte))
2596 return; 2567 return;
2597 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2568 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2611,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2640 int flooded = 0; 2611 int flooded = 0;
2641 int npte; 2612 int npte;
2642 int r; 2613 int r;
2614 int invlpg_counter;
2643 2615
2644 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2616 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2645 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2617
2618 invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
2619
2620 /*
2621 * Assume that the pte write on a page table of the same type
2622 * as the current vcpu paging mode. This is nearly always true
2623 * (might be false while changing modes). Note it is verified later
2624 * by update_pte().
2625 */
2626 if ((is_pae(vcpu) && bytes == 4) || !new) {
2627 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
2628 if (is_pae(vcpu)) {
2629 gpa &= ~(gpa_t)7;
2630 bytes = 8;
2631 }
2632 r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
2633 if (r)
2634 gentry = 0;
2635 new = (const u8 *)&gentry;
2636 }
2637
2638 switch (bytes) {
2639 case 4:
2640 gentry = *(const u32 *)new;
2641 break;
2642 case 8:
2643 gentry = *(const u64 *)new;
2644 break;
2645 default:
2646 gentry = 0;
2647 break;
2648 }
2649
2650 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2646 spin_lock(&vcpu->kvm->mmu_lock); 2651 spin_lock(&vcpu->kvm->mmu_lock);
2652 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2653 gentry = 0;
2647 kvm_mmu_access_page(vcpu, gfn); 2654 kvm_mmu_access_page(vcpu, gfn);
2648 kvm_mmu_free_some_pages(vcpu); 2655 kvm_mmu_free_some_pages(vcpu);
2649 ++vcpu->kvm->stat.mmu_pte_write; 2656 ++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2669,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2662 } 2669 }
2663 index = kvm_page_table_hashfn(gfn); 2670 index = kvm_page_table_hashfn(gfn);
2664 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2671 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
2672
2673restart:
2665 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2674 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
2666 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2675 if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
2667 continue; 2676 continue;
2668 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2677 pte_size = sp->role.cr4_pae ? 8 : 4;
2669 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2678 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
2670 misaligned |= bytes < 4; 2679 misaligned |= bytes < 4;
2671 if (misaligned || flooded) { 2680 if (misaligned || flooded) {
@@ -2682,14 +2691,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2682 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2691 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
2683 gpa, bytes, sp->role.word); 2692 gpa, bytes, sp->role.word);
2684 if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2693 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2685 n = bucket->first; 2694 goto restart;
2686 ++vcpu->kvm->stat.mmu_flooded; 2695 ++vcpu->kvm->stat.mmu_flooded;
2687 continue; 2696 continue;
2688 } 2697 }
2689 page_offset = offset; 2698 page_offset = offset;
2690 level = sp->role.level; 2699 level = sp->role.level;
2691 npte = 1; 2700 npte = 1;
2692 if (sp->role.glevels == PT32_ROOT_LEVEL) { 2701 if (!sp->role.cr4_pae) {
2693 page_offset <<= 1; /* 32->64 */ 2702 page_offset <<= 1; /* 32->64 */
2694 /* 2703 /*
2695 * A 32-bit pde maps 4MB while the shadow pdes map 2704 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2716,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2707 continue; 2716 continue;
2708 } 2717 }
2709 spte = &sp->spt[page_offset / sizeof(*spte)]; 2718 spte = &sp->spt[page_offset / sizeof(*spte)];
2710 if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
2711 gentry = 0;
2712 r = kvm_read_guest_atomic(vcpu->kvm,
2713 gpa & ~(u64)(pte_size - 1),
2714 &gentry, pte_size);
2715 new = (const void *)&gentry;
2716 if (r < 0)
2717 new = NULL;
2718 }
2719 while (npte--) { 2719 while (npte--) {
2720 entry = *spte; 2720 entry = *spte;
2721 mmu_pte_write_zap_pte(vcpu, sp, spte); 2721 mmu_pte_write_zap_pte(vcpu, sp, spte);
2722 if (new) 2722 if (gentry)
2723 mmu_pte_write_new_pte(vcpu, sp, spte, new); 2723 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2724 mmu_pte_write_flush_tlb(vcpu, entry, *spte);
2725 ++spte; 2725 ++spte;
2726 } 2726 }
@@ -2900,22 +2900,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2900 struct kvm_mmu_page *sp, *node; 2900 struct kvm_mmu_page *sp, *node;
2901 2901
2902 spin_lock(&kvm->mmu_lock); 2902 spin_lock(&kvm->mmu_lock);
2903restart:
2903 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2904 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2904 if (kvm_mmu_zap_page(kvm, sp)) 2905 if (kvm_mmu_zap_page(kvm, sp))
2905 node = container_of(kvm->arch.active_mmu_pages.next, 2906 goto restart;
2906 struct kvm_mmu_page, link); 2907
2907 spin_unlock(&kvm->mmu_lock); 2908 spin_unlock(&kvm->mmu_lock);
2908 2909
2909 kvm_flush_remote_tlbs(kvm); 2910 kvm_flush_remote_tlbs(kvm);
2910} 2911}
2911 2912
2912static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2913static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
2913{ 2914{
2914 struct kvm_mmu_page *page; 2915 struct kvm_mmu_page *page;
2915 2916
2916 page = container_of(kvm->arch.active_mmu_pages.prev, 2917 page = container_of(kvm->arch.active_mmu_pages.prev,
2917 struct kvm_mmu_page, link); 2918 struct kvm_mmu_page, link);
2918 kvm_mmu_zap_page(kvm, page); 2919 return kvm_mmu_zap_page(kvm, page) + 1;
2919} 2920}
2920 2921
2921static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) 2922static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2928,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2927 spin_lock(&kvm_lock); 2928 spin_lock(&kvm_lock);
2928 2929
2929 list_for_each_entry(kvm, &vm_list, vm_list) { 2930 list_for_each_entry(kvm, &vm_list, vm_list) {
2930 int npages, idx; 2931 int npages, idx, freed_pages;
2931 2932
2932 idx = srcu_read_lock(&kvm->srcu); 2933 idx = srcu_read_lock(&kvm->srcu);
2933 spin_lock(&kvm->mmu_lock); 2934 spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2936,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2935 kvm->arch.n_free_mmu_pages; 2936 kvm->arch.n_free_mmu_pages;
2936 cache_count += npages; 2937 cache_count += npages;
2937 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 2938 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
2938 kvm_mmu_remove_one_alloc_mmu_page(kvm); 2939 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
2939 cache_count--; 2940 cache_count -= freed_pages;
2940 kvm_freed = kvm; 2941 kvm_freed = kvm;
2941 } 2942 }
2942 nr_to_scan--; 2943 nr_to_scan--;
@@ -3011,7 +3012,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3011 unsigned int nr_pages = 0; 3012 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots; 3013 struct kvm_memslots *slots;
3013 3014
3014 slots = rcu_dereference(kvm->memslots); 3015 slots = kvm_memslots(kvm);
3016
3015 for (i = 0; i < slots->nmemslots; i++) 3017 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages; 3018 nr_pages += slots->memslots[i].npages;
3017 3019
@@ -3174,8 +3176,7 @@ static gva_t canonicalize(gva_t gva)
3174} 3176}
3175 3177
3176 3178
3177typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, 3179typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3178 u64 *sptep);
3179 3180
3180static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, 3181static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3181 inspect_spte_fn fn) 3182 inspect_spte_fn fn)
@@ -3191,7 +3192,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3191 child = page_header(ent & PT64_BASE_ADDR_MASK); 3192 child = page_header(ent & PT64_BASE_ADDR_MASK);
3192 __mmu_spte_walk(kvm, child, fn); 3193 __mmu_spte_walk(kvm, child, fn);
3193 } else 3194 } else
3194 fn(kvm, sp, &sp->spt[i]); 3195 fn(kvm, &sp->spt[i]);
3195 } 3196 }
3196 } 3197 }
3197} 3198}
@@ -3282,11 +3283,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3282 3283
3283static int count_rmaps(struct kvm_vcpu *vcpu) 3284static int count_rmaps(struct kvm_vcpu *vcpu)
3284{ 3285{
3286 struct kvm *kvm = vcpu->kvm;
3287 struct kvm_memslots *slots;
3285 int nmaps = 0; 3288 int nmaps = 0;
3286 int i, j, k, idx; 3289 int i, j, k, idx;
3287 3290
3288 idx = srcu_read_lock(&kvm->srcu); 3291 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots); 3292 slots = kvm_memslots(kvm);
3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3293 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3291 struct kvm_memory_slot *m = &slots->memslots[i]; 3294 struct kvm_memory_slot *m = &slots->memslots[i];
3292 struct kvm_rmap_desc *d; 3295 struct kvm_rmap_desc *d;
@@ -3315,7 +3318,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3315 return nmaps; 3318 return nmaps;
3316} 3319}
3317 3320
3318void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) 3321void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3319{ 3322{
3320 unsigned long *rmapp; 3323 unsigned long *rmapp;
3321 struct kvm_mmu_page *rev_sp; 3324 struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3334,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
3331 printk(KERN_ERR "%s: no memslot for gfn %ld\n", 3334 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3332 audit_msg, gfn); 3335 audit_msg, gfn);
3333 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", 3336 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3334 audit_msg, sptep - rev_sp->spt, 3337 audit_msg, (long int)(sptep - rev_sp->spt),
3335 rev_sp->gfn); 3338 rev_sp->gfn);
3336 dump_stack(); 3339 dump_stack();
3337 return; 3340 return;
3338 } 3341 }
3339 3342
3340 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3343 rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
3341 is_large_pte(*sptep)); 3344 rev_sp->role.level);
3342 if (!*rmapp) { 3345 if (!*rmapp) {
3343 if (!printk_ratelimit()) 3346 if (!printk_ratelimit())
3344 return; 3347 return;
@@ -3373,7 +3376,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3373 continue; 3376 continue;
3374 if (!(ent & PT_WRITABLE_MASK)) 3377 if (!(ent & PT_WRITABLE_MASK))
3375 continue; 3378 continue;
3376 inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); 3379 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3377 } 3380 }
3378 } 3381 }
3379 return; 3382 return;