author     Junaid Shahid <junaids@google.com>        2016-12-06 19:46:16 -0500
committer  Radim Krčmář <rkrcmar@redhat.com>         2017-01-09 08:46:11 -0500
commit     f160c7b7bb322bf079a5bb4dd34c58f17553f193
tree       df8f36cae081baeec8859cc367175d6dcaf36c85
parent     37f0e8fe6b10ee2ab52576caa721ee1282de74a6
kvm: x86: mmu: Lockless access tracking for Intel CPUs without EPT A bits.
This change implements lockless access tracking for Intel CPUs without EPT
A bits. This is achieved by marking the PTEs as not-present (but not
completely clearing them) when clear_flush_young() is called after marking
the pages as accessed. When an EPT Violation is generated as a result of
the VM accessing those pages, the PTEs are restored to their original values.
Signed-off-by: Junaid Shahid <junaids@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
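
To make the encoding concrete, here is a minimal standalone sketch of the idea the patch implements in mmu.c. It is not the kernel code: the constants (EPT_*, SAVED_BITS_SHIFT, SPECIAL_BIT) and helpers (mark_for_access_track(), restore_access()) are simplified stand-ins for the shadow_acc_track_* masks and for mark_spte_for_access_track()/fast_pf_fix_direct_spte() in the diff below. Marking clears the R/W/X permissions so the next guest access takes an EPT violation, but stashes the R and X bits in otherwise-unused high PTE bits together with a marker bit; the restore step recognizes the marker and puts the saved bits back, which the fast page-fault path can do with a single cmpxchg and no MMU lock.

/* Illustrative sketch only; all names below are hypothetical stand-ins. */
#include <stdint.h>
#include <stdio.h>

#define EPT_READABLE		0x1ull
#define EPT_WRITABLE		0x2ull
#define EPT_EXECUTABLE		0x4ull
#define EPT_RWX			(EPT_READABLE | EPT_WRITABLE | EPT_EXECUTABLE)
#define SAVED_BITS_MASK		(EPT_READABLE | EPT_EXECUTABLE)
#define SAVED_BITS_SHIFT	52		/* assumed-free "available" PTE bits */
#define SPECIAL_BIT		(1ull << 62)	/* stand-in for SPTE_SPECIAL_MASK */

/* Save R/X into high bits, then clear R/W/X so the next access faults. */
static uint64_t mark_for_access_track(uint64_t spte)
{
	spte |= (spte & SAVED_BITS_MASK) << SAVED_BITS_SHIFT;
	spte &= ~EPT_RWX;
	return spte | SPECIAL_BIT;
}

/* On the resulting EPT violation, restore the saved R/X bits and drop the marker. */
static uint64_t restore_access(uint64_t spte)
{
	uint64_t saved = (spte >> SAVED_BITS_SHIFT) & SAVED_BITS_MASK;

	spte &= ~(SPECIAL_BIT | (SAVED_BITS_MASK << SAVED_BITS_SHIFT));
	return spte | saved;
}

int main(void)
{
	uint64_t spte = 0x123456000ull | EPT_READABLE | EPT_EXECUTABLE;
	uint64_t tracked = mark_for_access_track(spte);

	printf("original 0x%llx\ntracked  0x%llx\nrestored 0x%llx\n",
	       (unsigned long long)spte,
	       (unsigned long long)tracked,
	       (unsigned long long)restore_access(tracked));
	return 0;
}

Note that the W bit is deliberately not saved: as the patch comments explain, access-tracked PTEs also need dirty tracking, so writability is only restored when a write is actually attempted.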
-rw-r--r--  arch/x86/include/asm/kvm_host.h |   3
-rw-r--r--  arch/x86/include/asm/vmx.h      |   9
-rw-r--r--  arch/x86/kvm/mmu.c              | 279
-rw-r--r--  arch/x86/kvm/vmx.c              |  26
-rw-r--r--  arch/x86/kvm/x86.c              |   2
5 files changed, 239 insertions, 80 deletions
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3272a5e4aaad..99a71d90b6ae 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1064,7 +1064,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_init_vm(struct kvm *kvm);
 void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask);
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fc061cbb46e0..a22a4790f1ac 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -469,11 +469,14 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT			(1ull << 8)
 #define VMX_EPT_DIRTY_BIT			(1ull << 9)
+#define VMX_EPT_RWX_MASK			(VMX_EPT_READABLE_MASK |       \
+						 VMX_EPT_WRITABLE_MASK |       \
+						 VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK				(7ull << VMX_EPT_MT_EPTE_SHIFT)
 
 /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
 #define VMX_EPT_MISCONFIG_WX_VALUE		(VMX_EPT_WRITABLE_MASK |       \
 						 VMX_EPT_EXECUTABLE_MASK)
-
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b8b5259c8ebb..64821ca3a7c3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -38,6 +38,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/hash.h>
+#include <linux/kern_levels.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -130,6 +131,10 @@ module_param(dbg, bool, 0644);
 #define ACC_USER_MASK    PT_USER_MASK
 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK			0x1ull
+#define PT64_EPT_EXECUTABLE_MASK		0x4ull
+
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
@@ -179,6 +184,25 @@ static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mmio_mask;
 static u64 __read_mostly shadow_present_mask;
 
+/*
+ * The mask/value to distinguish a PTE that has been marked not-present for
+ * access tracking purposes.
+ * The mask would be either 0 if access tracking is disabled, or
+ * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+						    PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
 static void mmu_free_roots(struct kvm_vcpu *vcpu);
 
@@ -188,6 +212,12 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
+static inline bool is_access_track_spte(u64 spte)
+{
+	/* Always false if shadow_acc_track_mask is zero. */
+	return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
 /*
  * the low bit of the generation number is always presumed to be zero.
  * This disables mmio caching during memslot updates. The concept is
@@ -285,7 +315,8 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 }
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+		u64 acc_track_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
@@ -293,9 +324,23 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
 	shadow_present_mask = p_mask;
+	shadow_acc_track_mask = acc_track_mask;
+	WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
+void kvm_mmu_clear_all_pte_masks(void)
+{
+	shadow_user_mask = 0;
+	shadow_accessed_mask = 0;
+	shadow_dirty_mask = 0;
+	shadow_nx_mask = 0;
+	shadow_x_mask = 0;
+	shadow_mmio_mask = 0;
+	shadow_present_mask = 0;
+	shadow_acc_track_mask = 0;
+}
+
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -308,7 +353,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+	return (pte != 0) && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -482,32 +527,32 @@ static bool spte_can_locklessly_be_made_writable(u64 spte)
 
 static bool spte_has_volatile_bits(u64 spte)
 {
+	if (!is_shadow_present_pte(spte))
+		return false;
+
 	/*
 	 * Always atomically update spte if it can be updated
 	 * out of mmu-lock, it can ensure dirty bit is not lost,
 	 * also, it can help us to get a stable is_writable_pte()
 	 * to ensure tlb flush is not missed.
 	 */
-	if (spte_can_locklessly_be_made_writable(spte))
+	if (spte_can_locklessly_be_made_writable(spte) ||
+	    is_access_track_spte(spte))
 		return true;
 
-	if (!shadow_accessed_mask)
-		return false;
-
-	if (!is_shadow_present_pte(spte))
-		return false;
-
-	if ((spte & shadow_accessed_mask) &&
-	      (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
-		return false;
+	if (shadow_accessed_mask) {
+		if ((spte & shadow_accessed_mask) == 0 ||
+		    (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+			return true;
+	}
 
-	return true;
+	return false;
 }
 
 static bool is_accessed_spte(u64 spte)
 {
 	return shadow_accessed_mask ? spte & shadow_accessed_mask
-				    : true;
+				    : !is_access_track_spte(spte);
 }
 
 static bool is_dirty_spte(u64 spte)
@@ -651,6 +696,61 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
 	return __get_spte_lockless(sptep);
 }
 
+static u64 mark_spte_for_access_track(u64 spte)
+{
+	if (shadow_accessed_mask != 0)
+		return spte & ~shadow_accessed_mask;
+
+	if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+		return spte;
+
+	/*
+	 * Verify that the write-protection that we do below will be fixable
+	 * via the fast page fault path. Currently, that is always the case, at
+	 * least when using EPT (which is when access tracking would be used).
+	 */
+	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+		  !spte_can_locklessly_be_made_writable(spte),
+		  "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+			  shadow_acc_track_saved_bits_shift),
+		  "kvm: Access Tracking saved bit locations are not zero\n");
+
+	spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+		shadow_acc_track_saved_bits_shift;
+	spte &= ~shadow_acc_track_mask;
+	spte |= shadow_acc_track_value;
+
+	return spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+	u64 spte = mmu_spte_get_lockless(sptep);
+
+	if (!is_accessed_spte(spte))
+		return false;
+
+	if (shadow_accessed_mask) {
+		clear_bit((ffs(shadow_accessed_mask) - 1),
+			  (unsigned long *)sptep);
+	} else {
+		/*
+		 * Capture the dirty status of the page, so that it doesn't get
+		 * lost when the SPTE is marked for access tracking.
+		 */
+		if (is_writable_pte(spte))
+			kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+		spte = mark_spte_for_access_track(spte);
+		mmu_spte_update_no_track(sptep, spte);
+	}
+
+	return true;
+}
+
 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 {
 	/*
@@ -1435,7 +1535,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 restart:
 	for_each_rmap_spte(rmap_head, &iter, sptep) {
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+			    sptep, *sptep, gfn, level);
 
 		need_flush = 1;
 
@@ -1448,7 +1548,8 @@ restart:
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
-			new_spte &= ~shadow_accessed_mask;
+
+			new_spte = mark_spte_for_access_track(new_spte);
 
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
@@ -1610,15 +1711,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator uninitialized_var(iter);
 	int young = 0;
 
-	BUG_ON(!shadow_accessed_mask);
-
-	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		if (*sptep & shadow_accessed_mask) {
-			young = 1;
-			clear_bit((ffs(shadow_accessed_mask) - 1),
-				 (unsigned long *)sptep);
-		}
-	}
+	for_each_rmap_spte(rmap_head, &iter, sptep)
+		young |= mmu_spte_age(sptep);
 
 	trace_kvm_age_page(gfn, level, slot, young);
 	return young;
@@ -1632,11 +1726,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 	struct rmap_iterator iter;
 
 	/*
-	 * If there's no access bit in the secondary pte set by the
-	 * hardware it's up to gup-fast/gup to set the access bit in
-	 * the primary pte or in the page structure.
+	 * If there's no access bit in the secondary pte set by the hardware and
+	 * fast access tracking is also not enabled, it's up to gup-fast/gup to
+	 * set the access bit in the primary pte or in the page structure.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		goto out;
 
 	for_each_rmap_spte(rmap_head, &iter, sptep)
@@ -1671,7 +1765,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
 	 * This has some overhead, but not as much as the cost of swapping
 	 * out actively used pages or breaking up actively used hugepages.
 	 */
-	if (!shadow_accessed_mask)
+	if (!shadow_accessed_mask && !shadow_acc_track_mask)
 		return kvm_handle_hva_range(kvm, start, end, 0,
 					    kvm_unmap_rmapp);
 
@@ -2603,6 +2697,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= shadow_dirty_mask;
 	}
 
+	if (speculative)
+		spte = mark_spte_for_access_track(spte);
+
 set_pte:
 	if (mmu_spte_update(sptep, spte))
 		kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2656,7 +2753,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
-		 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+		 *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
 		 *sptep, sptep);
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
@@ -2889,16 +2986,28 @@ static bool page_fault_can_be_fast(u32 error_code)
 	if (unlikely(error_code & PFERR_RSVD_MASK))
 		return false;
 
+	/* See if the page fault is due to an NX violation */
+	if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+		      == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+		return false;
+
 	/*
-	 * #PF can be fast only if the shadow page table is present and it
-	 * is caused by write-protect, that means we just need change the
-	 * W bit of the spte which can be done out of mmu-lock.
+	 * #PF can be fast if:
+	 * 1. The shadow page table entry is not present, which could mean that
+	 *    the fault is potentially caused by access tracking (if enabled).
+	 * 2. The shadow page table entry is present and the fault
+	 *    is caused by write-protect, that means we just need change the W
+	 *    bit of the spte which can be done out of mmu-lock.
+	 *
+	 * However, if access tracking is disabled we know that a non-present
+	 * page must be a genuine page fault where we have to create a new SPTE.
+	 * So, if access tracking is disabled, we return true only for write
+	 * accesses to a present page.
 	 */
-	if (!(error_code & PFERR_PRESENT_MASK) ||
-	      !(error_code & PFERR_WRITE_MASK))
-		return false;
 
-	return true;
+	return shadow_acc_track_mask != 0 ||
+	       ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+		== (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
 }
 
 /*
@@ -2907,17 +3016,26 @@ static bool page_fault_can_be_fast(u32 error_code)
  */
 static bool
 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			u64 *sptep, u64 spte)
+			u64 *sptep, u64 old_spte,
+			bool remove_write_prot, bool remove_acc_track)
 {
 	gfn_t gfn;
+	u64 new_spte = old_spte;
 
 	WARN_ON(!sp->role.direct);
 
-	/*
-	 * The gfn of direct spte is stable since it is calculated
-	 * by sp->gfn.
-	 */
-	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+	if (remove_acc_track) {
+		u64 saved_bits = (old_spte >> shadow_acc_track_saved_bits_shift)
+				 & shadow_acc_track_saved_bits_mask;
+
+		new_spte &= ~shadow_acc_track_mask;
+		new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+			      shadow_acc_track_saved_bits_shift);
+		new_spte |= saved_bits;
+	}
+
+	if (remove_write_prot)
+		new_spte |= PT_WRITABLE_MASK;
 
 	/*
 	 * Theoretically we could also set dirty bit (and flush TLB) here in
@@ -2931,10 +3049,17 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 *
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
-	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) != spte)
+	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
 		return false;
 
-	kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	if (remove_write_prot) {
+		/*
+		 * The gfn of direct spte is stable since it is
+		 * calculated by sp->gfn.
+		 */
+		gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
+	}
 
 	return true;
 }
@@ -2965,35 +3090,55 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 			break;
 
 	do {
-		/*
-		 * If the mapping has been changed, let the vcpu fault on the
-		 * same address again.
-		 */
-		if (!is_shadow_present_pte(spte)) {
-			fault_handled = true;
-			break;
-		}
+		bool remove_write_prot = false;
+		bool remove_acc_track;
 
 		sp = page_header(__pa(iterator.sptep));
 		if (!is_last_spte(spte, sp->role.level))
			break;
 
		/*
-		 * Check if it is a spurious fault caused by TLB lazily flushed.
+		 * Check whether the memory access that caused the fault would
+		 * still cause it if it were to be performed right now. If not,
+		 * then this is a spurious fault caused by TLB lazily flushed,
+		 * or some other CPU has already fixed the PTE after the
+		 * current CPU took the fault.
		 *
		 * Need not check the access of upper level table entries since
		 * they are always ACC_ALL.
		 */
-		if (is_writable_pte(spte)) {
-			fault_handled = true;
-			break;
+
+		if (error_code & PFERR_FETCH_MASK) {
+			if ((spte & (shadow_x_mask | shadow_nx_mask))
+			    == shadow_x_mask) {
+				fault_handled = true;
+				break;
+			}
+		} else if (error_code & PFERR_WRITE_MASK) {
+			if (is_writable_pte(spte)) {
+				fault_handled = true;
+				break;
+			}
+
+			/*
+			 * Currently, to simplify the code, write-protection can
+			 * be removed in the fast path only if the SPTE was
+			 * write-protected for dirty-logging.
+			 */
+			remove_write_prot =
+				spte_can_locklessly_be_made_writable(spte);
+		} else {
+			/* Fault was on Read access */
+			if (spte & PT_PRESENT_MASK) {
+				fault_handled = true;
+				break;
+			}
		}
 
-		/*
-		 * Currently, to simplify the code, only the spte
-		 * write-protected by dirty-log can be fast fixed.
-		 */
-		if (!spte_can_locklessly_be_made_writable(spte))
+		remove_acc_track = is_access_track_spte(spte);
+
+		/* Verify that the fault can be handled in the fast path */
+		if (!remove_acc_track && !remove_write_prot)
			break;
 
		/*
@@ -3007,7 +3152,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
		 *
		 * See the comments in kvm_arch_commit_memory_region().
		 */
-		if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+		if (sp->role.level > PT_PAGE_TABLE_LEVEL && remove_write_prot)
			break;
 
		/*
@@ -3016,7 +3161,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
		 * Documentation/virtual/kvm/locking.txt to get more detail.
		 */
		fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
-							iterator.sptep, spte);
+							iterator.sptep, spte,
+							remove_write_prot,
+							remove_acc_track);
		if (fault_handled)
			break;
 
@@ -5105,6 +5252,8 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	kvm_mmu_clear_all_pte_masks();
+
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6f53dedd9b96..d2fe3a51876c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -6578,6 +6578,19 @@ static void wakeup_handler(void)
 	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 }
 
+void vmx_enable_tdp(void)
+{
+	kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+		enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+		0ull, VMX_EPT_EXECUTABLE_MASK,
+		cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+		enable_ept_ad_bits ? 0ull : SPTE_SPECIAL_MASK | VMX_EPT_RWX_MASK);
+
+	ept_set_mmio_spte_mask();
+	kvm_enable_tdp();
+}
+
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6703,16 +6716,9 @@ static __init int hardware_setup(void)
 	/* SELF-IPI */
 	vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
 
-	if (enable_ept) {
-		kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
-			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-			0ull, VMX_EPT_EXECUTABLE_MASK,
-			cpu_has_vmx_ept_execute_only() ?
-				0ull : VMX_EPT_READABLE_MASK);
-		ept_set_mmio_spte_mask();
-		kvm_enable_tdp();
-	} else
+	if (enable_ept)
+		vmx_enable_tdp();
+	else
 		kvm_disable_tdp();
 
 	update_ple_window_actual_max();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4aece8b0a4aa..c3ee5e29ea2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6025,7 +6025,7 @@ int kvm_arch_init(void *opaque)
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
-			PT_PRESENT_MASK);
+			PT_PRESENT_MASK, 0);
 	kvm_timer_init();
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);