Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/kvm/mmu.c | 144
1 file changed, 127 insertions, 17 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b160652f7eee..8637bffbdb4a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -446,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 }
 #endif
 
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
 static bool spte_has_volatile_bits(u64 spte)
 {
+        /*
+         * Always update the spte atomically if it can be updated
+         * out of mmu-lock; this ensures the dirty bit is never lost
+         * and also gives us a stable is_writable_pte() so that a
+         * needed TLB flush is not missed.
+         */
+        if (spte_is_locklessly_modifiable(spte))
+                return true;
+
         if (!shadow_accessed_mask)
                 return false;
 
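Why spte_is_locklessly_modifiable() can test both software bits in one expression: ~spte has a zero wherever spte has a one, so the AND above is zero exactly when both SPTE_HOST_WRITEABLE and SPTE_MMU_WRITEABLE are set. A minimal user-space sketch of the same bit test (the FAKE_* bit positions are placeholders, not the real SPTE_* definitions):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder bit positions, only for illustration. */
#define FAKE_SPTE_HOST_WRITEABLE (1ULL << 61)
#define FAKE_SPTE_MMU_WRITEABLE  (1ULL << 62)

/*
 * True only when BOTH software-writable bits are set: ~spte is zero in
 * every position where spte is one, so the AND is zero exactly when
 * neither required bit is missing.
 */
static bool fake_spte_is_locklessly_modifiable(uint64_t spte)
{
        return !(~spte & (FAKE_SPTE_HOST_WRITEABLE | FAKE_SPTE_MMU_WRITEABLE));
}

int main(void)
{
        uint64_t both = FAKE_SPTE_HOST_WRITEABLE | FAKE_SPTE_MMU_WRITEABLE;

        printf("%d\n", fake_spte_is_locklessly_modifiable(both));                    /* 1 */
        printf("%d\n", fake_spte_is_locklessly_modifiable(FAKE_SPTE_MMU_WRITEABLE)); /* 0 */
        return 0;
}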
@@ -489,7 +503,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
  */
 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 {
-        u64 mask, old_spte = *sptep;
+        u64 old_spte = *sptep;
         bool ret = false;
 
         WARN_ON(!is_rmap_spte(new_spte));
@@ -499,17 +513,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
                 return ret;
         }
 
-        new_spte |= old_spte & shadow_dirty_mask;
-
-        mask = shadow_accessed_mask;
-        if (is_writable_pte(old_spte))
-                mask |= shadow_dirty_mask;
-
-        if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+        if (!spte_has_volatile_bits(old_spte))
                 __update_clear_spte_fast(sptep, new_spte);
         else
                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 
+        /*
+         * Updating the spte out of mmu-lock is safe because we always
+         * update it atomically; see the comments in
+         * spte_has_volatile_bits().
+         */
         if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
                 ret = true;
 
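The fast/slow split above matters because hardware can set the accessed/dirty bits and, after this patch, another CPU can set the writable bit without holding mmu-lock; a plain store can overwrite those concurrent changes, whereas an atomic exchange hands back exactly the value it replaced. A simplified user-space analogue, assuming C11 atomics (the demo_* names are invented; this is not the kernel's __update_clear_spte_* code):

#include <stdatomic.h>
#include <stdint.h>

/* Fast path: a plain load + store, safe only when nothing else can touch
 * the spte concurrently. */
uint64_t demo_update_fast(_Atomic uint64_t *sptep, uint64_t new_spte)
{
        uint64_t old = atomic_load_explicit(sptep, memory_order_relaxed);

        atomic_store_explicit(sptep, new_spte, memory_order_relaxed);
        return old;     /* bits set between the load and the store are lost */
}

/* Slow path: the exchange returns exactly the value that was replaced, so
 * accessed/dirty/writable bits set by another agent are never lost. */
uint64_t demo_update_slow(_Atomic uint64_t *sptep, uint64_t new_spte)
{
        return atomic_exchange_explicit(sptep, new_spte, memory_order_relaxed);
}

mmu_spte_update() then inspects the returned old_spte, so a writable-to-read-only transition is never missed and the caller can flush the TLB when needed.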
@@ -1085,11 +1098,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
         kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
-static bool spte_is_locklessly_modifiable(u64 spte)
-{
-        return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
-}
-
 /*
  * Write-protect on the specified @sptep, @pt_protect indicates whether
  * spte write-protection is caused by protecting shadow page table.
@@ -2677,18 +2685,114 @@ exit:
         return ret;
 }
 
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+        /*
+         * A #PF can be fast only if the shadow page table is present and
+         * the fault is caused by write-protect; then we just need to change
+         * the W bit of the spte, which can be done out of mmu-lock.
+         */
+        if (!(error_code & PFERR_PRESENT_MASK) ||
+              !(error_code & PFERR_WRITE_MASK))
+                return false;
+
+        return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+        gfn_t gfn;
+
+        WARN_ON(!sp->role.direct);
+
+        /*
+         * The gfn of a direct spte is stable since it is calculated
+         * from sp->gfn.
+         */
+        gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+        if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+                mark_page_dirty(vcpu->kvm, gfn);
+
+        return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+                            u32 error_code)
+{
+        struct kvm_shadow_walk_iterator iterator;
+        bool ret = false;
+        u64 spte = 0ull;
+
+        if (!page_fault_can_be_fast(vcpu, error_code))
+                return false;
+
+        walk_shadow_page_lockless_begin(vcpu);
+        for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+                if (!is_shadow_present_pte(spte) || iterator.level < level)
+                        break;
+
+        /*
+         * If the mapping has been changed, let the vcpu fault on the
+         * same address again.
+         */
+        if (!is_rmap_spte(spte)) {
+                ret = true;
+                goto exit;
+        }
+
+        if (!is_last_spte(spte, level))
+                goto exit;
+
+        /*
+         * Check if it is a spurious fault caused by a lazily flushed TLB.
+         *
+         * There is no need to check the access bits of upper-level table
+         * entries since they are always ACC_ALL.
+         */
+        if (is_writable_pte(spte)) {
+                ret = true;
+                goto exit;
+        }
+
+        /*
+         * Currently, to simplify the code, only an spte write-protected
+         * by dirty logging can be fast-fixed.
+         */
+        if (!spte_is_locklessly_modifiable(spte))
+                goto exit;
+
+        /*
+         * Currently, fast page fault only works for direct mappings since
+         * the gfn is not stable for indirect shadow pages.
+         * See Documentation/virtual/kvm/locking.txt for more details.
+         */
+        ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+        walk_shadow_page_lockless_end(vcpu);
+
+        return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                          gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
-                         bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+                         gfn_t gfn, bool prefault)
 {
         int r;
         int level;
         int force_pt_level;
         pfn_t pfn;
         unsigned long mmu_seq;
-        bool map_writable;
+        bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
         force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
         if (likely(!force_pt_level)) {
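The core of the lockless fix above is a single compare-and-swap: the writable bit is set only if the spte still holds exactly the value observed during the lockless walk, and losing the race is harmless because the vcpu simply re-faults. A simplified user-space model of that step, assuming C11 atomics (DEMO_* and demo_* are illustrative names, not KVM's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define DEMO_PT_WRITABLE_MASK (1ULL << 1)       /* stand-in for the real W bit */

/*
 * Publish the writable bit only if the spte is still exactly what the
 * lockless walk observed.  If another CPU changed it in the meantime,
 * do nothing; the vcpu re-faults and the mmu-lock protected path handles it.
 */
bool demo_fast_pf_fix(_Atomic uint64_t *sptep, uint64_t observed)
{
        uint64_t expected = observed;

        return atomic_compare_exchange_strong(sptep, &expected,
                                              observed | DEMO_PT_WRITABLE_MASK);
}

int main(void)
{
        _Atomic uint64_t spte = 0x5;    /* arbitrary value seen by the walk */

        demo_fast_pf_fix(&spte, 0x5);   /* succeeds: nothing changed the spte */
        return atomic_load(&spte) == (0x5 | DEMO_PT_WRITABLE_MASK) ? 0 : 1;
}

Note that fast_pf_fix_direct_spte() above returns true even when the cmpxchg loses the race, since retrying the access is always safe; only a successful swap marks the page dirty.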
@@ -2705,6 +2809,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
         } else
                 level = PT_PAGE_TABLE_LEVEL;
 
+        if (fast_page_fault(vcpu, v, level, error_code))
+                return 0;
+
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
 
@@ -3093,7 +3200,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
         gfn = gva >> PAGE_SHIFT;
 
         return nonpaging_map(vcpu, gva & PAGE_MASK,
-                             error_code & PFERR_WRITE_MASK, gfn, prefault);
+                             error_code, gfn, prefault);
 }
 
 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3173,6 +3280,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         } else
                 level = PT_PAGE_TABLE_LEVEL;
 
+        if (fast_page_fault(vcpu, gpa, level, error_code))
+                return 0;
+
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
 
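Both call sites added above, in nonpaging_map() and tdp_page_fault(), follow the same shape: try the lockless fix before taking mmu_lock and return 0 on success so the guest re-executes the faulting access. A sketch of that control flow (the demo_* functions are invented stubs, not KVM code):

#include <stdbool.h>
#include <stdint.h>

/* Illustrative stubs; none of these are KVM functions. */
static bool demo_fast_page_fault(uint64_t addr, int level, uint32_t error_code)
{
        (void)addr; (void)level; (void)error_code;
        return false;           /* pretend the lockless fix did not apply */
}

static int demo_locked_path(uint64_t addr, int level, uint32_t error_code)
{
        (void)addr; (void)level; (void)error_code;
        return 1;               /* stand-in for the mmu_lock protected path */
}

/* Try the lockless fix first, and only fall back to the locked path when
 * it cannot help. */
int demo_handle_fault(uint64_t addr, int level, uint32_t error_code)
{
        if (demo_fast_page_fault(addr, level, error_code))
                return 0;       /* fixed locklessly; the guest retries the access */

        return demo_locked_path(addr, level, error_code);
}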