author		Junaid Shahid <junaids@google.com>	2016-12-06 19:46:16 -0500
committer	Radim Krčmář <rkrcmar@redhat.com>	2017-01-09 08:46:11 -0500
commit		f160c7b7bb322bf079a5bb4dd34c58f17553f193
tree		df8f36cae081baeec8859cc367175d6dcaf36c85
parent		37f0e8fe6b10ee2ab52576caa721ee1282de74a6
kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.
This change implements lockless access tracking for Intel CPUs without EPT A
bits. This is achieved by marking the PTEs as not-present (but not completely
clearing them) when clear_flush_young() is called after marking the pages as
accessed. When an EPT Violation is generated as a result of the VM accessing
those pages, the PTEs are restored to their original values.

Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
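[Editor's illustration] As a rough sketch of the mechanism described above, the following minimal, self-contained user-space C program mimics the bit manipulation that mark_spte_for_access_track() and fast_pf_fix_direct_spte() perform in the patch below: the R/X permission bits of an EPT PTE are stashed in software-available bits, the RWX bits are cleared so the next guest access takes an EPT Violation, and the saved bits are put back when that fault is handled. SPECIAL_MASK and SAVED_BITS_SHIFT here are illustrative stand-ins for the kernel's SPTE_SPECIAL_MASK and PT64_SECOND_AVAIL_BITS_SHIFT, not their actual definitions.

#include <stdint.h>
#include <stdio.h>

/* EPT R/W/X permission bits (bits 0-2 of an EPT PTE). */
#define EPT_READABLE		0x1ull
#define EPT_WRITABLE		0x2ull
#define EPT_EXECUTABLE		0x4ull
#define EPT_RWX			(EPT_READABLE | EPT_WRITABLE | EPT_EXECUTABLE)

/* Illustrative stand-ins for the kernel's mask/shift definitions. */
#define SPECIAL_MASK		(1ull << 62)
#define SAVED_BITS_SHIFT	52
#define SAVED_BITS_MASK		(EPT_READABLE | EPT_EXECUTABLE)

#define ACC_TRACK_MASK		(SPECIAL_MASK | EPT_RWX)
#define ACC_TRACK_VALUE		SPECIAL_MASK

/* Mark a PTE not-present while remembering its original R/X permissions. */
static uint64_t mark_for_access_track(uint64_t pte)
{
	/* Stash R and X in software-available bits, then clear RWX. */
	pte |= (pte & SAVED_BITS_MASK) << SAVED_BITS_SHIFT;
	pte &= ~EPT_RWX;
	pte |= ACC_TRACK_VALUE;
	return pte;
}

/* Restore the saved permissions when the guest faults on the page. */
static uint64_t restore_access_tracked(uint64_t pte)
{
	uint64_t saved = (pte >> SAVED_BITS_SHIFT) & SAVED_BITS_MASK;

	pte &= ~ACC_TRACK_MASK;
	pte &= ~(SAVED_BITS_MASK << SAVED_BITS_SHIFT);
	return pte | saved;
}

int main(void)
{
	uint64_t pte = 0x123450000ull | EPT_READABLE | EPT_EXECUTABLE;
	uint64_t tracked = mark_for_access_track(pte);

	printf("original: %#llx\n", (unsigned long long)pte);
	printf("tracked:  %#llx (no RWX => EPT Violation on access)\n",
	       (unsigned long long)tracked);
	printf("restored: %#llx\n",
	       (unsigned long long)restore_access_tracked(tracked));
	return 0;
}

Note that, as the patch's comment on shadow_acc_track_saved_bits_mask explains, the real code deliberately does not save the W bit: write access is restored separately through the dirty-logging fast path.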
-rw-r--r--	arch/x86/include/asm/kvm_host.h	  3
-rw-r--r--	arch/x86/include/asm/vmx.h	  9
-rw-r--r--	arch/x86/kvm/mmu.c		279
-rw-r--r--	arch/x86/kvm/vmx.c		 26
-rw-r--r--	arch/x86/kvm/x86.c		  2
5 files changed, 239 insertions(+), 80 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3272a5e4aaad..99a71d90b6ae 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1064,7 +1064,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fc061cbb46e0..a22a4790f1ac 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -469,11 +469,14 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT			(1ull << 8)
 #define VMX_EPT_DIRTY_BIT			(1ull << 9)
+#define VMX_EPT_RWX_MASK			(VMX_EPT_READABLE_MASK |       \
+						 VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
 
 /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
 #define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
 						 VMX_EPT_EXECUTABLE_MASK)
-
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b8b5259c8ebb..64821ca3a7c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/hash.h>
+#include <linux/kern_levels.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -130,6 +131,10 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK			0x1ull
+#define PT64_EPT_EXECUTABLE_MASK		0x4ull
+
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -179,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_present_mask;
 
+/*
+ * The mask/value to distinguish a PTE that has been marked not-present for
+ * access tracking purposes.
+ * The mask would be either 0 if access tracking is disabled, or
+ * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+						    PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
@@ -188,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+static inline bool is_access_track_spte(u64 spte)
+{
+	/* Always false if shadow_acc_track_mask is zero. */
+	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
 /*
  * the low bit of the generation number is always presumed to be zero.
  * This disables mmio caching during memslot updates. The concept is
@@ -285,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
@@ -293,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
+	shadow_acc_track_mask = acc_track_mask;
+	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_clear_all_pte_masks(void)
+{
+	shadow_user_mask = 0;
+	shadow_accessed_mask = 0;
+	shadow_dirty_mask = 0;
+	shadow_nx_mask = 0;
+	shadow_x_mask = 0;
+	shadow_mmio_mask = 0;
+	shadow_present_mask = 0;
+	shadow_acc_track_mask = 0;
+}
+
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -308,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+	return (pte != 0) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -482,32 +527,32 @@ static bool spte_can_locklessly_be_made_writable(u64 spte)
 
 static bool spte_has_volatile_bits(u64 spte)
 {
+	if (!is_shadow_present_pte(spte))
+		return false;
+
 	/*
 	 * Always atomically update spte if it can be updated
 	 * out of mmu-lock, it can ensure dirty bit is not lost,
 	 * also, it can help us to get a stable is_writable_pte()
 	 * to ensure tlb flush is not missed.
 	 */
-	if (spte_can_locklessly_be_made_writable(spte))
+	if (spte_can_locklessly_be_made_writable(spte) ||
+	    is_access_track_spte(spte))
 		return true;
 
-	if (!shadow_accessed_mask)
-		return false;
-
-	if (!is_shadow_present_pte(spte))
-		return false;
-
-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (shadow_accessed_mask) {
+		if ((spte & shadow_accessed_mask) == 0 ||
+		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+			return true;
+	}
 
-	return true;
+	return false;
 }
 
 static bool is_accessed_spte(u64 spte)
 {
 	return shadow_accessed_mask ? spte & shadow_accessed_mask
-				    : true;
+				    : !is_access_track_spte(spte);
 }
 
 static bool is_dirty_spte(u64 spte)
@@ -651,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 mark_spte_for_access_track(u64 spte)
+{
+	if (shadow_accessed_mask != 0)
+		return spte & ~shadow_accessed_mask;
+
+	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+		return spte;
+
+	/*
+	 * Verify that the write-protection that we do below will be fixable
+	 * via the fast page fault path. Currently, that is always the case, at
+	 * least when using EPT (which is when access tracking would be used).
+	 */
+	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+		  !spte_can_locklessly_be_made_writable(spte),
+		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+			  shadow_acc_track_saved_bits_shift),
+		  "kvm: Access Tracking saved bit locations are not zero\n");
+
+	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	spte &= ~shadow_acc_track_mask;
+	spte |= shadow_acc_track_value;
+
+	return spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+	u64 spte = mmu_spte_get_lockless(sptep);
+
+	if (!is_accessed_spte(spte))
+		return false;
+
+	if (shadow_accessed_mask) {
+		clear_bit((ffs(shadow_accessed_mask) - 1),
+			  (unsigned long *)sptep);
+	} else {
+		/*
+		 * Capture the dirty status of the page, so that it doesn't get
+		 * lost when the SPTE is marked for access tracking.
+		 */
+		if (is_writable_pte(spte))
+			kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+		spte = mark_spte_for_access_track(spte);
+		mmu_spte_update_no_track(sptep, spte);
+	}
+
+	return true;
+}
+
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1435,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+			    sptep, *sptep, gfn, level);
 
 		need_flush = 1;
 
@@ -1448,7 +1548,8 @@ restart:
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
-			new_spte &= ~shadow_accessed_mask;
+
+			new_spte = mark_spte_for_access_track(new_spte);
 
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
@@ -1610,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator uninitialized_var(iter);
 	int young = 0;
 
-	BUG_ON(!shadow_accessed_mask);
-
-	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			clear_bit((ffs(shadow_accessed_mask) - 1),
-				  (unsigned long *)sptep);
-		}
-	}
+	for_each_rmap_spte(rmap_head, &iter, sptep)
+		young |= mmu_spte_age(sptep);
 
 	trace_kvm_age_page(gfn, level, slot, young);
 	return young;
@@ -1632,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator iter;
 
 	/*
-	 * If there's no access bit in the secondary pte set by the
-	 * hardware it's up to gup-fast/gup to set the access bit in
-	 * the primary pte or in the page structure.
+	 * If there's no access bit in the secondary pte set by the hardware and
+	 * fast access tracking is also not enabled, it's up to gup-fast/gup to
+	 * set the access bit in the primary pte or in the page structure.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		goto out;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
@@ -1671,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 	 * This has some overhead, but not as much as the cost of swapping
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		return kvm_handle_hva_range(kvm, start, end, 0,
 					    kvm_unmap_rmapp);
 
@@ -2603,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_dirty_mask;
 	}
 
+	if (speculative)
+		spte = mark_spte_for_access_track(spte);
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2656,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
 		 *sptep, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
@@ -2889,16 +2986,28 @@ static bool page_fault_can_be_fast(u32 error_code)
 	if (unlikely(error_code & PFERR_RSVD_MASK))
 		return false;
 
+	/* See if the page fault is due to an NX violation */
+	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+		return false;
+
 	/*
-	 * #PF can be fast only if the shadow page table is present and it
-	 * is caused by write-protect, that means we just need change the
-	 * W bit of the spte which can be done out of mmu-lock.
+	 * #PF can be fast if:
+	 * 1. The shadow page table entry is not present, which could mean that
+	 *    the fault is potentially caused by access tracking (if enabled).
+	 * 2. The shadow page table entry is present and the fault
+	 *    is caused by write-protect, that means we just need change the W
+	 *    bit of the spte which can be done out of mmu-lock.
+	 *
+	 * However, if access tracking is disabled we know that a non-present
+	 * page must be a genuine page fault where we have to create a new SPTE.
+	 * So, if access tracking is disabled, we return true only for write
+	 * accesses to a present page.
 	 */
-	if (!(error_code & PFERR_PRESENT_MASK) ||
-	    !(error_code & PFERR_WRITE_MASK))
-		return false;
 
-	return true;
+	return shadow_acc_track_mask != 0 ||
+	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
 }
 
 /*
@@ -2907,17 +3016,26 @@ static bool page_fault_can_be_fast(u32 error_code)
  */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 spte)
+			u64 *sptep, u64 old_spte,
+			bool remove_write_prot, bool remove_acc_track)
 {
 	gfn_t gfn;
+	u64 new_spte = old_spte;
 
 	WARN_ON(!sp->role.direct);
 
-	/*
-	 * The gfn of direct spte is stable since it is calculated
-	 * by sp->gfn.
-	 */
-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	if (remove_acc_track) {
+		u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift)
+				 & shadow_acc_track_saved_bits_mask;
+
+		new_spte &= ~shadow_acc_track_mask;
+		new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+			      shadow_acc_track_saved_bits_shift);
+		new_spte |= saved_bits;
+	}
+
+	if (remove_write_prot)
+		new_spte |= PT_WRITABLE_MASK;
 
 	/*
 	 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -2931,10 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 *
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
-	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
+	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
 		return false;
 
-	kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	if (remove_write_prot) {
+		/*
+		 * The gfn of direct spte is stable since it is
+		 * calculated by sp->gfn.
+		 */
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
 
 	return true;
 }
@@ -2965,35 +3090,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 			break;
 
 	do {
-		/*
-		 * If the mapping has been changed, let the vcpu fault on the
-		 * same address again.
-		 */
-		if (!is_shadow_present_pte(spte)) {
-			fault_handled = true;
-			break;
-		}
+		bool remove_write_prot = false;
+		bool remove_acc_track;
 
 		sp = page_header(__pa(iterator.sptep));
 		if (!is_last_spte(spte, sp->role.level))
 			break;
 
 		/*
-		 * Check if it is a spurious fault caused by TLB lazily flushed.
+		 * Check whether the memory access that caused the fault would
+		 * still cause it if it were to be performed right now. If not,
+		 * then this is a spurious fault caused by TLB lazily flushed,
+		 * or some other CPU has already fixed the PTE after the
+		 * current CPU took the fault.
 		 *
 		 * Need not check the access of upper level table entries since
 		 * they are always ACC_ALL.
 		 */
-		if (is_writable_pte(spte)) {
-			fault_handled = true;
-			break;
+
+		if (error_code & PFERR_FETCH_MASK) {
+			if ((spte & (shadow_x_mask | shadow_nx_mask))
+			    == shadow_x_mask) {
+				fault_handled = true;
+				break;
+			}
+		} else if (error_code & PFERR_WRITE_MASK) {
+			if (is_writable_pte(spte)) {
+				fault_handled = true;
+				break;
+			}
+
+			/*
+			 * Currently, to simplify the code, write-protection can
+			 * be removed in the fast path only if the SPTE was
+			 * write-protected for dirty-logging.
+			 */
+			remove_write_prot =
+				spte_can_locklessly_be_made_writable(spte);
+		} else {
+			/* Fault was on Read access */
+			if (spte & PT_PRESENT_MASK) {
+				fault_handled = true;
+				break;
+			}
 		}
 
-		/*
-		 * Currently, to simplify the code, only the spte
-		 * write-protected by dirty-log can be fast fixed.
-		 */
-		if (!spte_can_locklessly_be_made_writable(spte))
+		remove_acc_track = is_access_track_spte(spte);
+
+		/* Verify that the fault can be handled in the fast path */
+		if (!remove_acc_track && !remove_write_prot)
 			break;
 
 		/*
@@ -3007,7 +3152,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		 *
 		 * See the comments in kvm_arch_commit_memory_region().
 		 */
-		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot)
 			break;
 
 		/*
@@ -3016,7 +3161,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 		 * Documentation/virtual/kvm/locking.txt to get more detail.
 		 */
 		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
-							iterator.sptep, spte);
+							iterator.sptep, spte,
+							remove_write_prot,
+							remove_acc_track);
 		if (fault_handled)
 			break;
 
@@ -5105,6 +5252,8 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	kvm_mmu_clear_all_pte_masks();
+
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6f53dedd9b96..d2fe3a51876c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6578,6 +6578,19 @@ static void wakeup_handler(void)
 	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 }
 
+void vmx_enable_tdp(void)
+{
+	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+		0ull, VMX_EPT_EXECUTABLE_MASK,
+		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK);
+
+	ept_set_mmio_spte_mask();
+	kvm_enable_tdp();
+}
+
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6703,16 +6716,9 @@ static __init int hardware_setup(void)
 	/* SELF-IPI */
 	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
 
-	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-			0ull, VMX_EPT_EXECUTABLE_MASK,
-			cpu_has_vmx_ept_execute_only() ?
-				0ull : VMX_EPT_READABLE_MASK);
-		ept_set_mmio_spte_mask();
-		kvm_enable_tdp();
-	} else
+	if (enable_ept)
+		vmx_enable_tdp();
+	else
 		kvm_disable_tdp();
 
 	update_ple_window_actual_max();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4aece8b0a4aa..c3ee5e29ea2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6025,7 +6025,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK);
+			PT_PRESENT_MASK, 0);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);