author    Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 15:01:20 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-07-24 15:01:20 -0400
commit    5fecc9d8f59e765c2a48379dd7c6f5cf88c7d75a (patch)
tree      d1fc25d9650d3ac24591bba6f5e2e7a1afc54796 /arch/x86/kvm/mmu.c
parent    3c4cfadef6a1665d9cd02a543782d03d3e6740c6 (diff)
parent    1a577b72475d161b6677c05abe57301362023bb2 (diff)
Merge tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Avi Kivity:
 "Highlights include
  - full big real mode emulation on pre-Westmere Intel hosts (can be
    disabled with emulate_invalid_guest_state=0)
  - relatively small ppc and s390 updates
  - PCID/INVPCID support in guests
  - EOI avoidance; 3.6 guests should perform better on 3.6 hosts on
    interrupt intensive workloads
  - Lockless write faults during live migration
  - EPT accessed/dirty bits support for new Intel processors"

Fix up conflicts in:
 - Documentation/virtual/kvm/api.txt: Stupid subchapter numbering, added
   next to each other.
 - arch/powerpc/kvm/booke_interrupts.S: PPC asm changes clashing with the
   KVM fixes
 - arch/s390/include/asm/sigp.h, arch/s390/kvm/sigp.c: Duplicated commits
   through the kvm tree and the s390 tree, with subsequent edits in the
   KVM tree.

* tag 'kvm-3.6-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (93 commits)
  KVM: fix race with level interrupts
  x86, hyper: fix build with !CONFIG_KVM_GUEST
  Revert "apic: fix kvm build on UP without IOAPIC"
  KVM guest: switch to apic_set_eoi_write, apic_write
  apic: add apic_set_eoi_write for PV use
  KVM: VMX: Implement PCID/INVPCID for guests with EPT
  KVM: Add x86_hyper_kvm to complete detect_hypervisor_platform check
  KVM: PPC: Critical interrupt emulation support
  KVM: PPC: e500mc: Fix tlbilx emulation for 64-bit guests
  KVM: PPC64: booke: Set interrupt computation mode for 64-bit host
  KVM: PPC: bookehv: Add ESR flag to Data Storage Interrupt
  KVM: PPC: bookehv64: Add support for std/ld emulation.
  booke: Added crit/mc exception handler for e500v2
  booke/bookehv: Add host crit-watchdog exception support
  KVM: MMU: document mmu-lock and fast page fault
  KVM: MMU: fix kvm_mmu_pagetable_walk tracepoint
  KVM: MMU: trace fast page fault
  KVM: MMU: fast path of handling guest page fault
  KVM: MMU: introduce SPTE_MMU_WRITEABLE bit
  KVM: MMU: fold tlb flush judgement into mmu_spte_update
  ...
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c | 359
1 file changed, 261 insertions(+), 98 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 57e168e27b5b..01ca00423938 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,7 +90,7 @@ module_param(dbg, bool, 0644);
 
 #define PTE_PREFETCH_NUM 8
 
-#define PT_FIRST_AVAIL_BITS_SHIFT 9
+#define PT_FIRST_AVAIL_BITS_SHIFT 10
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
 #define PT64_LEVEL_BITS 9
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
@@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
@@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif
 
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+	/*
+	 * Always atomicly update spte if it can be updated
+	 * out of mmu-lock, it can ensure dirty bit is not lost,
+	 * also, it can help us to get a stable is_writable_pte()
+	 * to ensure tlb flush is not missed.
+	 */
+	if (spte_is_locklessly_modifiable(spte))
+		return true;
+
 	if (!shadow_accessed_mask)
 		return false;
 
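The new helper's test, !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)), is true only when both software-available bits are set, i.e. the host has not write-protected the page and the MMU has not write-protected it for shadow-paging reasons; only such sptes may have their W bit flipped back on outside mmu_lock, which is why spte_has_volatile_bits() now forces the atomic update path for them. A minimal userspace sketch of the both-bits-set idiom (the shift value mirrors the define above; the sample spte values are made up):

#include <stdint.h>
#include <stdio.h>

#define PT_FIRST_AVAIL_BITS_SHIFT 10
#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define SPTE_MMU_WRITEABLE  (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))

/* True only when both software-available bits are set in the spte. */
static int locklessly_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
	uint64_t spte = SPTE_HOST_WRITEABLE;		/* only one bit set */

	printf("%d\n", locklessly_modifiable(spte));	/* prints 0 */
	spte |= SPTE_MMU_WRITEABLE;			/* both bits set */
	printf("%d\n", locklessly_modifiable(spte));	/* prints 1 */
	return 0;
}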
@@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
 
 /* Rules for using mmu_spte_update:
  * Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
  */
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-	u64 mask, old_spte = *sptep;
+	u64 old_spte = *sptep;
+	bool ret = false;
 
 	WARN_ON(!is_rmap_spte(new_spte));
 
-	if (!is_shadow_present_pte(old_spte))
-		return mmu_spte_set(sptep, new_spte);
-
-	new_spte |= old_spte & shadow_dirty_mask;
-
-	mask = shadow_accessed_mask;
-	if (is_writable_pte(old_spte))
-		mask |= shadow_dirty_mask;
+	if (!is_shadow_present_pte(old_spte)) {
+		mmu_spte_set(sptep, new_spte);
+		return ret;
+	}
 
-	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+	if (!spte_has_volatile_bits(old_spte))
 		__update_clear_spte_fast(sptep, new_spte);
 	else
 		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+	/*
+	 * For the spte updated out of mmu-lock is safe, since
+	 * we always atomicly update it, see the comments in
+	 * spte_has_volatile_bits().
+	 */
+	if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+		ret = true;
+
 	if (!shadow_accessed_mask)
-		return;
+		return ret;
 
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
 		kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 	if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+	return ret;
 }
 
 /*
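With the return value in place, callers can fold the "writable spte became read-only" check into the update itself instead of re-deriving it afterwards; the set_spte() hunk further down does exactly that. A hedged sketch of the resulting caller pattern, with toy stand-ins for the kernel helpers (the mask and the mock update function are simplified here, not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ULL << 1)	/* x86 PTE R/W bit, for illustration */

/* Toy stand-ins for the kernel helpers, only to show the calling pattern. */
static bool is_writable_pte(uint64_t spte)
{
	return spte & PT_WRITABLE_MASK;
}

static bool mock_mmu_spte_update(uint64_t *sptep, uint64_t new_spte)
{
	bool flush = is_writable_pte(*sptep) && !is_writable_pte(new_spte);

	*sptep = new_spte;
	return flush;	/* true: a writable spte became read-only */
}

int main(void)
{
	uint64_t spte = PT_WRITABLE_MASK | 0x1000;

	/* caller pattern after this series: flush only when the update says so */
	if (mock_mmu_spte_update(&spte, spte & ~PT_WRITABLE_MASK))
		printf("flush remote TLBs\n");
	return 0;
}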
@@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 				mmu_page_header_cache);
 }
 
-static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
-				    size_t size)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 {
 	void *p;
 
@@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
-				      sizeof(struct pte_list_desc));
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
 }
 
 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
@@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 		rmap_remove(kvm, sptep);
 }
 
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+	if (is_large_pte(*sptep)) {
+		WARN_ON(page_header(__pa(sptep))->role.level ==
+			PT_PAGE_TABLE_LEVEL);
+		drop_spte(kvm, sptep);
+		--kvm->stat.lpages;
+		return true;
+	}
+
+	return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	if (__drop_large_spte(vcpu->kvm, sptep))
+		kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep, @pt_protect indicates whether
+ * spte writ-protection is caused by protecting shadow page table.
+ * @flush indicates whether tlb need be flushed.
+ *
+ * Note: write protection is difference between drity logging and spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at anytime if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be writable only after unsync-ing
+ *   shadow page.
+ *
+ * Return true if the spte is dropped.
+ */
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
+{
+	u64 spte = *sptep;
+
+	if (!is_writable_pte(spte) &&
+	      !(pt_protect && spte_is_locklessly_modifiable(spte)))
+		return false;
+
+	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+	if (__drop_large_spte(kvm, sptep)) {
+		*flush |= true;
+		return true;
+	}
+
+	if (pt_protect)
+		spte &= ~SPTE_MMU_WRITEABLE;
+	spte = spte & ~PT_WRITABLE_MASK;
+
+	*flush |= mmu_spte_update(sptep, spte);
+	return false;
+}
+
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+				 int level, bool pt_protect)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int write_protected = 0;
+	bool flush = false;
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
-		if (!is_writable_pte(*sptep)) {
-			sptep = rmap_get_next(&iter);
-			continue;
-		}
-
-		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
-			sptep = rmap_get_next(&iter);
-		} else {
-			BUG_ON(!is_large_pte(*sptep));
-			drop_spte(kvm, sptep);
-			--kvm->stat.lpages;
+		if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
 			sptep = rmap_get_first(*rmapp, &iter);
+			continue;
 		}
 
-		write_protected = 1;
+		sptep = rmap_get_next(&iter);
 	}
 
-	return write_protected;
+	return flush;
 }
 
 /**
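The pt_protect flag is what separates the two callers in the next hunk: dirty logging passes false, so only the hardware W bit is cleared and SPTE_MMU_WRITEABLE survives, leaving the spte eligible for the lockless fast page fault; protecting a shadow page passes true and clears SPTE_MMU_WRITEABLE as well, so the spte cannot be made writable again without mmu_lock. A toy model of that bit manipulation (the bit positions are illustrative, mirroring the defines earlier in the file):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK    (1ULL << 1)		/* illustrative bit positions */
#define SPTE_HOST_WRITEABLE (1ULL << 10)
#define SPTE_MMU_WRITEABLE  (1ULL << 11)

/* Toy model of the bit manipulation in spte_write_protect(). */
static uint64_t write_protect(uint64_t spte, bool pt_protect)
{
	if (pt_protect)
		spte &= ~SPTE_MMU_WRITEABLE;	/* forbid lockless re-enable */
	return spte & ~PT_WRITABLE_MASK;	/* the hardware W bit always goes */
}

int main(void)
{
	uint64_t spte = PT_WRITABLE_MASK | SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;

	/* dirty logging: the fast page fault path may later restore the W bit */
	printf("dirty-log:  %#llx\n", (unsigned long long)write_protect(spte, false));
	/* shadow-page protection: lockless restore must stay impossible */
	printf("pt-protect: %#llx\n", (unsigned long long)write_protect(spte, true));
	return 0;
}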
@@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
 
 	while (mask) {
 		rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
-		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+		__rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
 
 		/* clear the first set bit */
 		mask &= mask - 1;
 	}
 }
 
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	int i;
-	int write_protected = 0;
+	bool write_protected = false;
 
 	slot = gfn_to_memslot(kvm, gfn);
 
 	for (i = PT_PAGE_TABLE_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, i);
+		write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
 	}
 
 	return write_protected;
@@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
 	u64 *sptep;
-	struct rmap_iterator iter;
+	struct rmap_iterator uninitialized_var(iter);
 	int young = 0;
 
 	/*
-	 * Emulate the accessed bit for EPT, by checking if this page has
+	 * In case of absence of EPT Access and Dirty Bits supports,
+	 * emulate the accessed bit for EPT, by checking if this page has
 	 * an EPT mapping, and clearing it if it does. On the next access,
 	 * a new EPT mapping will be established.
 	 * This has some overhead, but not as much as the cost of swapping
@@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		BUG_ON(!is_shadow_present_pte(*sptep));
 
-		if (*sptep & PT_ACCESSED_MASK) {
+		if (*sptep & shadow_accessed_mask) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
+			clear_bit((ffs(shadow_accessed_mask) - 1),
+				  (unsigned long *)sptep);
 		}
 	}
 
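Switching from the hard-coded PT_ACCESSED_SHIFT to ffs(shadow_accessed_mask) - 1 lets the aging code clear whichever bit the platform actually uses as its accessed bit, which matters now that EPT access/dirty bits are supported and shadow_accessed_mask may no longer be the legacy PTE bit. A small sketch of the arithmetic; the two mask values are given only for illustration:

#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	/*
	 * Illustrative mask values only: ffs() returns the 1-based index of
	 * the lowest set bit, so ffs(mask) - 1 is the bit number clear_bit()
	 * expects, whatever bit the platform uses for "accessed".
	 */
	unsigned int legacy_accessed = 1u << 5;	/* classic x86 PTE Accessed bit */
	unsigned int ept_ad_accessed = 1u << 8;	/* EPT Accessed bit (A/D format) */

	printf("legacy: clear bit %d\n", ffs(legacy_accessed) - 1);	/* 5 */
	printf("ept:    clear bit %d\n", ffs(ept_ad_accessed) - 1);	/* 8 */
	return 0;
}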
@@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
 	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		BUG_ON(!is_shadow_present_pte(*sptep));
 
-		if (*sptep & PT_ACCESSED_MASK) {
+		if (*sptep & shadow_accessed_mask) {
 			young = 1;
 			break;
 		}
@@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
-	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
-				    sizeof *sp);
-	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
+	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	if (!direct)
-		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-						  PAGE_SIZE);
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
@@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
-		int protected = 0;
+		bool protected = false;
 
 		for_each_sp(pages, sp, parents, i)
 			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
@@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	mmu_spte_set(sptep, spte);
 }
 
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
-	if (is_large_pte(*sptep)) {
-		drop_spte(vcpu->kvm, sptep);
-		--vcpu->kvm->stat.lpages;
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	}
-}
-
 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 unsigned direct_access)
 {
@@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
-	u64 spte, entry = *sptep;
+	u64 spte;
 	int ret = 0;
 
 	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_x_mask;
 	else
 		spte |= shadow_nx_mask;
+
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;
+
 	if (level > PT_PAGE_TABLE_LEVEL)
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
@@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			goto done;
 		}
 
-		spte |= PT_WRITABLE_MASK;
+		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
 		if (!vcpu->arch.mmu.direct_map
 		    && !(pte_access & ACC_WRITE_MASK)) {
@@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 				 __func__, gfn);
 			ret = 1;
 			pte_access &= ~ACC_WRITE_MASK;
-			if (is_writable_pte(spte))
-				spte &= ~PT_WRITABLE_MASK;
+			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 		}
 	}
 
@@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	mmu_spte_update(sptep, spte);
-	/*
-	 * If we overwrite a writable spte with a read-only one we
-	 * should flush remote TLBs. Otherwise rmap_write_protect
-	 * will find a read-only spte, even though the writable spte
-	 * might be cached on a CPU's TLB.
-	 */
-	if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
 done:
 	return ret;
@@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 {
+	mmu_free_roots(vcpu);
 }
 
 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2625,18 +2685,116 @@ exit:
 	return ret;
 }
 
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+	/*
+	 * #PF can be fast only if the shadow page table is present and it
+	 * is caused by write-protect, that means we just need change the
+	 * W bit of the spte which can be done out of mmu-lock.
+	 */
+	if (!(error_code & PFERR_PRESENT_MASK) ||
+	      !(error_code & PFERR_WRITE_MASK))
+		return false;
+
+	return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	gfn_t gfn;
+
+	WARN_ON(!sp->role.direct);
+
+	/*
+	 * The gfn of direct spte is stable since it is calculated
+	 * by sp->gfn.
+	 */
+	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+		mark_page_dirty(vcpu->kvm, gfn);
+
+	return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu to access on the same address again.
+ * - false: let the real page fault path to fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+			    u32 error_code)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	bool ret = false;
+	u64 spte = 0ull;
+
+	if (!page_fault_can_be_fast(vcpu, error_code))
+		return false;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+		if (!is_shadow_present_pte(spte) || iterator.level < level)
+			break;
+
+	/*
+	 * If the mapping has been changed, let the vcpu fault on the
+	 * same address again.
+	 */
+	if (!is_rmap_spte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	if (!is_last_spte(spte, level))
+		goto exit;
+
+	/*
+	 * Check if it is a spurious fault caused by TLB lazily flushed.
+	 *
+	 * Need not check the access of upper level table entries since
+	 * they are always ACC_ALL.
+	 */
+	if (is_writable_pte(spte)) {
+		ret = true;
+		goto exit;
+	}
+
+	/*
+	 * Currently, to simplify the code, only the spte write-protected
+	 * by dirty-log can be fast fixed.
+	 */
+	if (!spte_is_locklessly_modifiable(spte))
+		goto exit;
+
+	/*
+	 * Currently, fast page fault only works for direct mapping since
+	 * the gfn is not stable for indirect shadow page.
+	 * See Documentation/virtual/kvm/locking.txt to get more detail.
+	 */
+	ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+			      spte, ret);
+	walk_shadow_page_lockless_end(vcpu);
+
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-			 bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+			 gfn_t gfn, bool prefault)
 {
 	int r;
 	int level;
 	int force_pt_level;
 	pfn_t pfn;
 	unsigned long mmu_seq;
-	bool map_writable;
+	bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
 	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
 	if (likely(!force_pt_level)) {
@@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, v, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
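fast_page_fault() walks the shadow page table with no mmu_lock held and then restores the W bit with cmpxchg64, so if the spte was zapped or changed concurrently the exchange simply fails and the guest retries (the real code returns true in either case and lets the access be replayed). A hedged userspace model of that compare-and-swap step, using C11 atomics in place of the kernel's cmpxchg64():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE_MASK (1ULL << 1)

/*
 * Userspace model of the core of fast_pf_fix_direct_spte(): set the W bit
 * only if the spte still holds the value sampled during the lockless walk.
 * C11 atomics stand in for the kernel's cmpxchg64().
 */
static int fix_spte(_Atomic uint64_t *sptep, uint64_t seen)
{
	uint64_t expected = seen;

	return atomic_compare_exchange_strong(sptep, &expected,
					      seen | PT_WRITABLE_MASK);
}

int main(void)
{
	_Atomic uint64_t spte = 0x1000;	/* read-only spte sampled by the walker */

	if (fix_spte(&spte, 0x1000))
		printf("W bit restored: %#llx\n", (unsigned long long)spte);
	else
		printf("spte changed under us, leave it to the slow path\n");
	return 0;
}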
@@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	gfn = gva >> PAGE_SHIFT;
 
 	return nonpaging_map(vcpu, gva & PAGE_MASK,
-			     error_code & PFERR_WRITE_MASK, gfn, prefault);
+			     error_code, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
 
+	if (fast_page_fault(vcpu, gpa, level, error_code))
+		return 0;
+
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
@@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
+	bool flush = false;
 
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
@@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 			    !is_last_spte(pt[i], sp->role.level))
 				continue;
 
-			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i]);
-				--kvm->stat.lpages;
-				continue;
-			}
-
-			/* avoid RMW */
-			if (is_writable_pte(pt[i]))
-				mmu_spte_update(&pt[i],
-						pt[i] & ~PT_WRITABLE_MASK);
+			spte_write_protect(kvm, &pt[i], &flush, false);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
@@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
-	struct kvm *kvm_freed = NULL;
 	int nr_to_scan = sc->nr_to_scan;
 
 	if (nr_to_scan == 0)
@@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 		int idx;
 		LIST_HEAD(invalid_list);
 
+		/*
+		 * n_used_mmu_pages is accessed without holding kvm->mmu_lock
+		 * here. We may skip a VM instance errorneosly, but we do not
+		 * want to shrink a VM that only started to populate its MMU
+		 * anyway.
+		 */
+		if (kvm->arch.n_used_mmu_pages > 0) {
+			if (!nr_to_scan--)
+				break;
+			continue;
+		}
+
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
-		if (!kvm_freed && nr_to_scan > 0 &&
-		    kvm->arch.n_used_mmu_pages > 0) {
-			kvm_mmu_remove_some_alloc_mmu_pages(kvm,
-							    &invalid_list);
-			kvm_freed = kvm;
-		}
-		nr_to_scan--;
 
+		kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list);
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
+
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
+
+		list_move_tail(&kvm->vm_list, &vm_list);
+		break;
 	}
-	if (kvm_freed)
-		list_move_tail(&kvm_freed->vm_list, &vm_list);
 
 	raw_spin_unlock(&kvm_lock);
 
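The reworked shrinker zaps pages from at most one VM per invocation and then rotates that VM to the tail of vm_list before breaking out, so repeated shrink calls spread the pressure across VMs instead of always hitting the head of the list. A toy model of that rotate-to-tail pattern (the array and shrink_one() are illustrative stand-ins for the kernel's vm_list and zap path):

#include <stdio.h>

#define NVMS 3

/*
 * Toy model of the new shrinker flow: shrink one victim per call, then move
 * it to the tail (the kernel does this with list_move_tail()) so the next
 * call picks a different VM.
 */
static void shrink_one(int vm)
{
	printf("zap some pages of vm%d\n", vm);
}

int main(void)
{
	int vms[NVMS] = { 0, 1, 2 };

	for (int call = 0; call < 4; call++) {
		int victim = vms[0];

		shrink_one(victim);
		/* rotate the victim to the tail, shift the rest forward */
		for (int i = 0; i < NVMS - 1; i++)
			vms[i] = vms[i + 1];
		vms[NVMS - 1] = victim;
	}
	return 0;
}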