Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 301 |
1 file changed, 245 insertions(+), 56 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) | |||
197 | } | 197 | } |
198 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); | 198 | EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); |
199 | 199 | ||
200 | static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) | 200 | /* |
201 | * Bits 3 ~ 11 of the spte are used as the low 9 bits of the generation ||
202 | * number, and bits 52 ~ 61 are used as the high 10 bits of the ||
203 | * generation number. ||
204 | */ | ||
205 | #define MMIO_SPTE_GEN_LOW_SHIFT 3 | ||
206 | #define MMIO_SPTE_GEN_HIGH_SHIFT 52 | ||
207 | |||
208 | #define MMIO_GEN_SHIFT 19 | ||
209 | #define MMIO_GEN_LOW_SHIFT 9 | ||
210 | #define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) | ||
211 | #define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) | ||
212 | #define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) | ||
213 | |||
214 | static u64 generation_mmio_spte_mask(unsigned int gen) | ||
201 | { | 215 | { |
202 | struct kvm_mmu_page *sp = page_header(__pa(sptep)); | 216 | u64 mask; |
217 | |||
218 | WARN_ON(gen > MMIO_MAX_GEN); | ||
219 | |||
220 | mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; | ||
221 | mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; | ||
222 | return mask; | ||
223 | } | ||
224 | |||
225 | static unsigned int get_mmio_spte_generation(u64 spte) | ||
226 | { | ||
227 | unsigned int gen; | ||
228 | |||
229 | spte &= ~shadow_mmio_mask; | ||
230 | |||
231 | gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; | ||
232 | gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; | ||
233 | return gen; | ||
234 | } | ||
235 | |||
236 | static unsigned int kvm_current_mmio_generation(struct kvm *kvm) | ||
237 | { | ||
238 | /* | ||
239 | * Initialize the kvm generation close to MMIO_MAX_GEN so that the ||
240 | * code handling generation-number wrap-around is easy to test. ||
241 | */ | ||
242 | return (kvm_memslots(kvm)->generation + | ||
243 | MMIO_MAX_GEN - 150) & MMIO_GEN_MASK; | ||
244 | } | ||
245 | |||
246 | static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, | ||
247 | unsigned access) | ||
248 | { | ||
249 | unsigned int gen = kvm_current_mmio_generation(kvm); | ||
250 | u64 mask = generation_mmio_spte_mask(gen); | ||
203 | 251 | ||
204 | access &= ACC_WRITE_MASK | ACC_USER_MASK; | 252 | access &= ACC_WRITE_MASK | ACC_USER_MASK; |
253 | mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; | ||
205 | 254 | ||
206 | sp->mmio_cached = true; | 255 | trace_mark_mmio_spte(sptep, gfn, access, gen); |
207 | trace_mark_mmio_spte(sptep, gfn, access); | 256 | mmu_spte_set(sptep, mask); |
208 | mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); | ||
209 | } | 257 | } |
210 | 258 | ||
211 | static bool is_mmio_spte(u64 spte) | 259 | static bool is_mmio_spte(u64 spte) |
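The packing above splits a 19-bit generation across spte bits 3 ~ 11 (low 9 bits) and 52 ~ 61 (high 10 bits). Below is a minimal user-space sketch of the same packing and unpacking, useful for sanity-checking the bit layout; all names here are illustrative and not part of the patch.

#include <stdint.h>
#include <assert.h>

#define GEN_LOW_SHIFT   3                 /* spte bits 3..11: low 9 bits    */
#define GEN_HIGH_SHIFT  52                /* spte bits 52..61: high 10 bits */
#define GEN_LOW_MASK    ((1u << 9) - 1)
#define GEN_MAX         ((1u << 19) - 1)  /* 19-bit generation number */

static uint64_t pack_gen(unsigned gen)
{
	return ((uint64_t)(gen & GEN_LOW_MASK) << GEN_LOW_SHIFT) |
	       (((uint64_t)gen >> 9) << GEN_HIGH_SHIFT);
}

static unsigned unpack_gen(uint64_t spte)
{
	return ((spte >> GEN_LOW_SHIFT) & GEN_LOW_MASK) |
	       (((spte >> GEN_HIGH_SHIFT) & 0x3ff) << 9);
}

int main(void)
{
	/* Round-trip every possible generation value. */
	for (unsigned gen = 0; gen <= GEN_MAX; gen++)
		assert(unpack_gen(pack_gen(gen)) == gen);
	return 0;
}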
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte) | |||
215 | 263 | ||
216 | static gfn_t get_mmio_spte_gfn(u64 spte) | 264 | static gfn_t get_mmio_spte_gfn(u64 spte) |
217 | { | 265 | { |
218 | return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; | 266 | u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; |
267 | return (spte & ~mask) >> PAGE_SHIFT; | ||
219 | } | 268 | } |
220 | 269 | ||
221 | static unsigned get_mmio_spte_access(u64 spte) | 270 | static unsigned get_mmio_spte_access(u64 spte) |
222 | { | 271 | { |
223 | return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; | 272 | u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; |
273 | return (spte & ~mask) & ~PAGE_MASK; | ||
224 | } | 274 | } |
225 | 275 | ||
226 | static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) | 276 | static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, |
277 | pfn_t pfn, unsigned access) | ||
227 | { | 278 | { |
228 | if (unlikely(is_noslot_pfn(pfn))) { | 279 | if (unlikely(is_noslot_pfn(pfn))) { |
229 | mark_mmio_spte(sptep, gfn, access); | 280 | mark_mmio_spte(kvm, sptep, gfn, access); |
230 | return true; | 281 | return true; |
231 | } | 282 | } |
232 | 283 | ||
233 | return false; | 284 | return false; |
234 | } | 285 | } |
235 | 286 | ||
287 | static bool check_mmio_spte(struct kvm *kvm, u64 spte) | ||
288 | { | ||
289 | unsigned int kvm_gen, spte_gen; | ||
290 | |||
291 | kvm_gen = kvm_current_mmio_generation(kvm); | ||
292 | spte_gen = get_mmio_spte_generation(spte); | ||
293 | |||
294 | trace_check_mmio_spte(spte, kvm_gen, spte_gen); | ||
295 | return likely(kvm_gen == spte_gen); | ||
296 | } | ||
297 | |||
236 | static inline u64 rsvd_bits(int s, int e) | 298 | static inline u64 rsvd_bits(int s, int e) |
237 | { | 299 | { |
238 | return ((1ULL << (e - s + 1)) - 1) << s; | 300 | return ((1ULL << (e - s + 1)) - 1) << s; |
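get_mmio_spte_gfn() and get_mmio_spte_access() now strip generation_mmio_spte_mask(MMIO_MAX_GEN), i.e. every bit position the generation could ever occupy, in addition to shadow_mmio_mask. The standalone sketch below shows that this still recovers the gfn and the access bits exactly; the shadow_mmio_mask value and the access encoding used here are hypothetical, not the kernel's.

#include <stdint.h>
#include <assert.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1ull << PAGE_SHIFT) - 1))
#define GEN_LOW_SHIFT   3
#define GEN_HIGH_SHIFT  52
#define GEN_LOW_MASK    ((1u << 9) - 1)
#define GEN_MAX         ((1u << 19) - 1)

static const uint64_t shadow_mmio_mask = 3ull << 62;   /* hypothetical value */

static uint64_t gen_mask(unsigned gen)
{
	return ((uint64_t)(gen & GEN_LOW_MASK) << GEN_LOW_SHIFT) |
	       (((uint64_t)gen >> 9) << GEN_HIGH_SHIFT);
}

int main(void)
{
	uint64_t gfn = 0x12345, access = 0x6 /* say, write|user */, spte, strip;
	unsigned gen = 0x1abcd;

	/* Build an MMIO spte the way mark_mmio_spte() does. */
	spte = shadow_mmio_mask | access | (gfn << PAGE_SHIFT) | gen_mask(gen);

	/* Masking out shadow_mmio_mask plus every possible generation bit
	 * (gen_mask(GEN_MAX)) leaves only the gfn and access fields. */
	strip = gen_mask(GEN_MAX) | shadow_mmio_mask;
	assert(((spte & ~strip) >> PAGE_SHIFT) == gfn);
	assert(((spte & ~strip) & ~PAGE_MASK) == access);
	return 0;
}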
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) | |||
404 | /* | 466 | /* |
405 | * The idea of using a lightweight way to get the spte on x86_32 guests | 467 | * The idea of using a lightweight way to get the spte on x86_32 guests |
406 | * comes from gup_get_pte(arch/x86/mm/gup.c). | 468 | * comes from gup_get_pte(arch/x86/mm/gup.c). |
407 | * The difference is we can not catch the spte tlb flush if we leave | 469 | * |
408 | * guest mode, so we emulate it by increase clear_spte_count when spte | 470 | * An spte tlb flush may be pending, because kvm_set_pte_rmapp |
409 | * is cleared. | 471 | * coalesces them and we may be running outside of the MMU lock. Therefore |
472 | * we need to protect against in-progress updates of the spte. | ||
473 | * | ||
474 | * Reading the spte while an update is in progress may get the old value | ||
475 | * for the high part of the spte. The race is fine for a present->non-present | ||
476 | * change (because the high part of the spte is ignored for a non-present spte), ||
477 | * but for a present->present change we must reread the spte. | ||
478 | * | ||
479 | * All such changes are done in two steps (present->non-present and | ||
480 | * non-present->present), hence it is enough to count the number of | ||
481 | * present->non-present updates: if it changed while reading the spte, | ||
482 | * we might have hit the race. This is done using clear_spte_count. | ||
410 | */ | 483 | */ |
411 | static u64 __get_spte_lockless(u64 *sptep) | 484 | static u64 __get_spte_lockless(u64 *sptep) |
412 | { | 485 | { |
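The comment above describes a seqcount-like protocol: count the present->non-present transitions and retry the read if the counter changed underneath you. Below is a generic user-space sketch of that retry loop using C11 atomics; it illustrates the idea only, and the kernel's __get_spte_lockless additionally re-reads the low word before accepting the result.

#include <stdatomic.h>
#include <stdint.h>

struct spte64 {
	_Atomic uint32_t lo, hi;        /* the two halves, written separately  */
	_Atomic uint32_t clear_count;   /* bumped on present -> non-present    */
};

/* Sketch of the counter-based lockless read; not kernel code. */
static uint64_t read_spte_lockless(struct spte64 *s)
{
	uint32_t count, lo, hi;

retry:
	count = atomic_load_explicit(&s->clear_count, memory_order_acquire);
	lo = atomic_load_explicit(&s->lo, memory_order_acquire);
	hi = atomic_load_explicit(&s->hi, memory_order_acquire);
	/* A present->non-present transition happened meanwhile: retry. */
	if (atomic_load_explicit(&s->clear_count, memory_order_acquire) != count)
		goto retry;
	return ((uint64_t)hi << 32) | lo;
}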
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, | |||
1511 | if (!direct) | 1584 | if (!direct) |
1512 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); | 1585 | sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); |
1513 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); | 1586 | set_page_private(virt_to_page(sp->spt), (unsigned long)sp); |
1587 | |||
1588 | /* | ||
1589 | * active_mmu_pages is a FIFO list; do not move a page on it until ||
1590 | * the page is zapped. kvm_zap_obsolete_pages() depends on this ||
1591 | * property. See the comments in kvm_zap_obsolete_pages(). ||
1592 | */ | ||
1514 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); | 1593 | list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); |
1515 | sp->parent_ptes = 0; | 1594 | sp->parent_ptes = 0; |
1516 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); | 1595 | mmu_page_add_parent_pte(vcpu, sp, parent_pte); |
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
1648 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, | 1727 | static void kvm_mmu_commit_zap_page(struct kvm *kvm, |
1649 | struct list_head *invalid_list); | 1728 | struct list_head *invalid_list); |
1650 | 1729 | ||
1730 | /* | ||
1731 | * NOTE: pay extra attention to zapped-obsolete pages ||
1732 | * (is_obsolete_sp(sp) && sp->role.invalid) when doing a hash-list walk, ||
1733 | * since such a page has been deleted from active_mmu_pages but can ||
1734 | * still be found on the hash list. ||
1735 | * ||
1736 | * for_each_gfn_indirect_valid_sp skips that kind of page, and ||
1737 | * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips ||
1738 | * all obsolete pages. ||
1739 | */ | ||
1651 | #define for_each_gfn_sp(_kvm, _sp, _gfn) \ | 1740 | #define for_each_gfn_sp(_kvm, _sp, _gfn) \ |
1652 | hlist_for_each_entry(_sp, \ | 1741 | hlist_for_each_entry(_sp, \ |
1653 | &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ | 1742 | &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ |
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte) | |||
1838 | __clear_sp_write_flooding_count(sp); | 1927 | __clear_sp_write_flooding_count(sp); |
1839 | } | 1928 | } |
1840 | 1929 | ||
1930 | static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) | ||
1931 | { | ||
1932 | return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); | ||
1933 | } | ||
1934 | |||
1841 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | 1935 | static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, |
1842 | gfn_t gfn, | 1936 | gfn_t gfn, |
1843 | gva_t gaddr, | 1937 | gva_t gaddr, |
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1864 | role.quadrant = quadrant; | 1958 | role.quadrant = quadrant; |
1865 | } | 1959 | } |
1866 | for_each_gfn_sp(vcpu->kvm, sp, gfn) { | 1960 | for_each_gfn_sp(vcpu->kvm, sp, gfn) { |
1961 | if (is_obsolete_sp(vcpu->kvm, sp)) | ||
1962 | continue; | ||
1963 | |||
1867 | if (!need_sync && sp->unsync) | 1964 | if (!need_sync && sp->unsync) |
1868 | need_sync = true; | 1965 | need_sync = true; |
1869 | 1966 | ||
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, | |||
1900 | 1997 | ||
1901 | account_shadowed(vcpu->kvm, gfn); | 1998 | account_shadowed(vcpu->kvm, gfn); |
1902 | } | 1999 | } |
2000 | sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; | ||
1903 | init_shadow_page_table(sp); | 2001 | init_shadow_page_table(sp); |
1904 | trace_kvm_mmu_get_page(sp, true); | 2002 | trace_kvm_mmu_get_page(sp, true); |
1905 | return sp; | 2003 | return sp; |
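Stamping each new shadow page with mmu_valid_gen at creation time, and skipping pages whose stamp no longer matches during lookup, is what lets a later bump of mmu_valid_gen obsolete every existing page at once. A toy model of that idea (names are illustrative, not KVM code):

#include <stdbool.h>
#include <stdio.h>

struct mmu  { unsigned long valid_gen; };
struct page { unsigned long valid_gen; };

static struct page new_page(struct mmu *m)
{
	struct page p = { .valid_gen = m->valid_gen };  /* stamp at creation */
	return p;
}

static bool obsolete(struct mmu *m, struct page *p)
{
	return p->valid_gen != m->valid_gen;
}

int main(void)
{
	struct mmu m = { .valid_gen = 0 };
	struct page old = new_page(&m);

	m.valid_gen++;                  /* "invalidate" everything at once */
	struct page fresh = new_page(&m);

	printf("old obsolete: %d, fresh obsolete: %d\n",
	       obsolete(&m, &old), obsolete(&m, &fresh));  /* prints 1, 0 */
	return 0;
}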
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
2070 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); | 2168 | ret = mmu_zap_unsync_children(kvm, sp, invalid_list); |
2071 | kvm_mmu_page_unlink_children(kvm, sp); | 2169 | kvm_mmu_page_unlink_children(kvm, sp); |
2072 | kvm_mmu_unlink_parents(kvm, sp); | 2170 | kvm_mmu_unlink_parents(kvm, sp); |
2171 | |||
2073 | if (!sp->role.invalid && !sp->role.direct) | 2172 | if (!sp->role.invalid && !sp->role.direct) |
2074 | unaccount_shadowed(kvm, sp->gfn); | 2173 | unaccount_shadowed(kvm, sp->gfn); |
2174 | |||
2075 | if (sp->unsync) | 2175 | if (sp->unsync) |
2076 | kvm_unlink_unsync_page(kvm, sp); | 2176 | kvm_unlink_unsync_page(kvm, sp); |
2077 | if (!sp->root_count) { | 2177 | if (!sp->root_count) { |
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, | |||
2081 | kvm_mod_used_mmu_pages(kvm, -1); | 2181 | kvm_mod_used_mmu_pages(kvm, -1); |
2082 | } else { | 2182 | } else { |
2083 | list_move(&sp->link, &kvm->arch.active_mmu_pages); | 2183 | list_move(&sp->link, &kvm->arch.active_mmu_pages); |
2084 | kvm_reload_remote_mmus(kvm); | 2184 | |
2185 | /* | ||
2186 | * Obsolete pages cannot be used by any vcpu. ||
2187 | * See the comments in kvm_mmu_invalidate_zap_all_pages(). | ||
2188 | */ | ||
2189 | if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) | ||
2190 | kvm_reload_remote_mmus(kvm); | ||
2085 | } | 2191 | } |
2086 | 2192 | ||
2087 | sp->role.invalid = 1; | 2193 | sp->role.invalid = 1; |
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | |||
2331 | u64 spte; | 2437 | u64 spte; |
2332 | int ret = 0; | 2438 | int ret = 0; |
2333 | 2439 | ||
2334 | if (set_mmio_spte(sptep, gfn, pfn, pte_access)) | 2440 | if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) |
2335 | return 0; | 2441 | return 0; |
2336 | 2442 | ||
2337 | spte = PT_PRESENT_MASK; | 2443 | spte = PT_PRESENT_MASK; |
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) | |||
2869 | 2975 | ||
2870 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) | 2976 | if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) |
2871 | return; | 2977 | return; |
2872 | spin_lock(&vcpu->kvm->mmu_lock); | 2978 | |
2873 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && | 2979 | if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && |
2874 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || | 2980 | (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || |
2875 | vcpu->arch.mmu.direct_map)) { | 2981 | vcpu->arch.mmu.direct_map)) { |
2876 | hpa_t root = vcpu->arch.mmu.root_hpa; | 2982 | hpa_t root = vcpu->arch.mmu.root_hpa; |
2877 | 2983 | ||
2984 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2878 | sp = page_header(root); | 2985 | sp = page_header(root); |
2879 | --sp->root_count; | 2986 | --sp->root_count; |
2880 | if (!sp->root_count && sp->role.invalid) { | 2987 | if (!sp->root_count && sp->role.invalid) { |
2881 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); | 2988 | kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); |
2882 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); | 2989 | kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); |
2883 | } | 2990 | } |
2884 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
2885 | spin_unlock(&vcpu->kvm->mmu_lock); | 2991 | spin_unlock(&vcpu->kvm->mmu_lock); |
2992 | vcpu->arch.mmu.root_hpa = INVALID_PAGE; | ||
2886 | return; | 2993 | return; |
2887 | } | 2994 | } |
2995 | |||
2996 | spin_lock(&vcpu->kvm->mmu_lock); | ||
2888 | for (i = 0; i < 4; ++i) { | 2997 | for (i = 0; i < 4; ++i) { |
2889 | hpa_t root = vcpu->arch.mmu.pae_root[i]; | 2998 | hpa_t root = vcpu->arch.mmu.pae_root[i]; |
2890 | 2999 | ||
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) | |||
3148 | return spte; | 3257 | return spte; |
3149 | } | 3258 | } |
3150 | 3259 | ||
3151 | /* | ||
3152 | * If it is a real mmio page fault, return 1 and emulate the instruction ||
3153 | * directly, return 0 to let CPU fault again on the address, -1 is | ||
3154 | * returned if bug is detected. | ||
3155 | */ | ||
3156 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | 3260 | int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) |
3157 | { | 3261 | { |
3158 | u64 spte; | 3262 | u64 spte; |
3159 | 3263 | ||
3160 | if (quickly_check_mmio_pf(vcpu, addr, direct)) | 3264 | if (quickly_check_mmio_pf(vcpu, addr, direct)) |
3161 | return 1; | 3265 | return RET_MMIO_PF_EMULATE; |
3162 | 3266 | ||
3163 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); | 3267 | spte = walk_shadow_page_get_mmio_spte(vcpu, addr); |
3164 | 3268 | ||
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |||
3166 | gfn_t gfn = get_mmio_spte_gfn(spte); | 3270 | gfn_t gfn = get_mmio_spte_gfn(spte); |
3167 | unsigned access = get_mmio_spte_access(spte); | 3271 | unsigned access = get_mmio_spte_access(spte); |
3168 | 3272 | ||
3273 | if (!check_mmio_spte(vcpu->kvm, spte)) | ||
3274 | return RET_MMIO_PF_INVALID; | ||
3275 | |||
3169 | if (direct) | 3276 | if (direct) |
3170 | addr = 0; | 3277 | addr = 0; |
3171 | 3278 | ||
3172 | trace_handle_mmio_page_fault(addr, gfn, access); | 3279 | trace_handle_mmio_page_fault(addr, gfn, access); |
3173 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); | 3280 | vcpu_cache_mmio_info(vcpu, addr, gfn, access); |
3174 | return 1; | 3281 | return RET_MMIO_PF_EMULATE; |
3175 | } | 3282 | } |
3176 | 3283 | ||
3177 | /* | 3284 | /* |
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) | |||
3179 | * it's a BUG if the gfn is not a mmio page. | 3286 | * it's a BUG if the gfn is not a mmio page. |
3180 | */ | 3287 | */ |
3181 | if (direct && !check_direct_spte_mmio_pf(spte)) | 3288 | if (direct && !check_direct_spte_mmio_pf(spte)) |
3182 | return -1; | 3289 | return RET_MMIO_PF_BUG; |
3183 | 3290 | ||
3184 | /* | 3291 | /* |
3185 | * If the page table has been zapped by another cpu, let the CPU fault | 3292 | * If the page table has been zapped by another cpu, let the CPU fault |
3186 | * again on the address. | 3293 | * again on the address. |
3187 | */ | 3294 | */ |
3188 | return 0; | 3295 | return RET_MMIO_PF_RETRY; |
3189 | } | 3296 | } |
3190 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); | 3297 | EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); |
3191 | 3298 | ||
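handle_mmio_page_fault_common() now returns one of four RET_MMIO_PF_* codes instead of the bare 1/0/-1 described by the removed comment. They are presumably defined in arch/x86/kvm/mmu.h in this series; the sketch below shows the intended meanings, with the exact values being an assumption.

enum {
	RET_MMIO_PF_EMULATE = 1,  /* real MMIO access: emulate the instruction  */
	RET_MMIO_PF_INVALID = 2,  /* stale MMIO spte: let the slow path rebuild */
	RET_MMIO_PF_RETRY   = 0,  /* spte zapped by another cpu: fault again    */
	RET_MMIO_PF_BUG     = -1  /* inconsistent state: bug detected           */
};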
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, | |||
3195 | int ret; | 3302 | int ret; |
3196 | 3303 | ||
3197 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); | 3304 | ret = handle_mmio_page_fault_common(vcpu, addr, direct); |
3198 | WARN_ON(ret < 0); | 3305 | WARN_ON(ret == RET_MMIO_PF_BUG); |
3199 | return ret; | 3306 | return ret; |
3200 | } | 3307 | } |
3201 | 3308 | ||
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, | |||
3207 | 3314 | ||
3208 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); | 3315 | pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); |
3209 | 3316 | ||
3210 | if (unlikely(error_code & PFERR_RSVD_MASK)) | 3317 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
3211 | return handle_mmio_page_fault(vcpu, gva, error_code, true); | 3318 | r = handle_mmio_page_fault(vcpu, gva, error_code, true); |
3319 | |||
3320 | if (likely(r != RET_MMIO_PF_INVALID)) | ||
3321 | return r; | ||
3322 | } | ||
3212 | 3323 | ||
3213 | r = mmu_topup_memory_caches(vcpu); | 3324 | r = mmu_topup_memory_caches(vcpu); |
3214 | if (r) | 3325 | if (r) |
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | |||
3284 | ASSERT(vcpu); | 3395 | ASSERT(vcpu); |
3285 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); | 3396 | ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); |
3286 | 3397 | ||
3287 | if (unlikely(error_code & PFERR_RSVD_MASK)) | 3398 | if (unlikely(error_code & PFERR_RSVD_MASK)) { |
3288 | return handle_mmio_page_fault(vcpu, gpa, error_code, true); | 3399 | r = handle_mmio_page_fault(vcpu, gpa, error_code, true); |
3400 | |||
3401 | if (likely(r != RET_MMIO_PF_INVALID)) | ||
3402 | return r; | ||
3403 | } | ||
3289 | 3404 | ||
3290 | r = mmu_topup_memory_caches(vcpu); | 3405 | r = mmu_topup_memory_caches(vcpu); |
3291 | if (r) | 3406 | if (r) |
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte) | |||
3391 | *access &= mask; | 3506 | *access &= mask; |
3392 | } | 3507 | } |
3393 | 3508 | ||
3394 | static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | 3509 | static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, |
3395 | int *nr_present) | 3510 | unsigned access, int *nr_present) |
3396 | { | 3511 | { |
3397 | if (unlikely(is_mmio_spte(*sptep))) { | 3512 | if (unlikely(is_mmio_spte(*sptep))) { |
3398 | if (gfn != get_mmio_spte_gfn(*sptep)) { | 3513 | if (gfn != get_mmio_spte_gfn(*sptep)) { |
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, | |||
3401 | } | 3516 | } |
3402 | 3517 | ||
3403 | (*nr_present)++; | 3518 | (*nr_present)++; |
3404 | mark_mmio_spte(sptep, gfn, access); | 3519 | mark_mmio_spte(kvm, sptep, gfn, access); |
3405 | return true; | 3520 | return true; |
3406 | } | 3521 | } |
3407 | 3522 | ||
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) | |||
3764 | if (r) | 3879 | if (r) |
3765 | goto out; | 3880 | goto out; |
3766 | r = mmu_alloc_roots(vcpu); | 3881 | r = mmu_alloc_roots(vcpu); |
3767 | spin_lock(&vcpu->kvm->mmu_lock); | 3882 | kvm_mmu_sync_roots(vcpu); |
3768 | mmu_sync_roots(vcpu); | ||
3769 | spin_unlock(&vcpu->kvm->mmu_lock); | ||
3770 | if (r) | 3883 | if (r) |
3771 | goto out; | 3884 | goto out; |
3772 | /* set_cr3() should ensure TLB has been flushed */ | 3885 | /* set_cr3() should ensure TLB has been flushed */ |
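The open-coded lock/sync/unlock sequence is replaced by a call to kvm_mmu_sync_roots(). Presumably that helper is a lock-taking wrapper defined elsewhere in mmu.c, roughly along these lines (a sketch; the wrapper itself is not shown in this diff):

void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
	/* Take mmu_lock around the existing sync helper. */
	spin_lock(&vcpu->kvm->mmu_lock);
	mmu_sync_roots(vcpu);
	spin_unlock(&vcpu->kvm->mmu_lock);
}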
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) | |||
4179 | spin_unlock(&kvm->mmu_lock); | 4292 | spin_unlock(&kvm->mmu_lock); |
4180 | } | 4293 | } |
4181 | 4294 | ||
4182 | void kvm_mmu_zap_all(struct kvm *kvm) | 4295 | #define BATCH_ZAP_PAGES 10 |
4296 | static void kvm_zap_obsolete_pages(struct kvm *kvm) | ||
4183 | { | 4297 | { |
4184 | struct kvm_mmu_page *sp, *node; | 4298 | struct kvm_mmu_page *sp, *node; |
4185 | LIST_HEAD(invalid_list); | 4299 | int batch = 0; |
4186 | 4300 | ||
4187 | spin_lock(&kvm->mmu_lock); | ||
4188 | restart: | 4301 | restart: |
4189 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) | 4302 | list_for_each_entry_safe_reverse(sp, node, |
4190 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) | 4303 | &kvm->arch.active_mmu_pages, link) { |
4304 | int ret; | ||
4305 | |||
4306 | /* | ||
4307 | * Since active_mmu_pages is a FIFO list, all obsolete pages sit at ||
4308 | * the tail; stop once a non-obsolete page is found. ||
4309 | */ | ||
4310 | if (!is_obsolete_sp(kvm, sp)) | ||
4311 | break; | ||
4312 | |||
4313 | /* | ||
4314 | * Since we walk the list in reverse and invalid pages are ||
4315 | * moved to the head of the list, skipping invalid pages here ||
4316 | * avoids walking the list forever. ||
4317 | */ | ||
4318 | if (sp->role.invalid) | ||
4319 | continue; | ||
4320 | |||
4321 | /* | ||
4322 | * No need to flush the TLB here, since we only zap sps with an ||
4323 | * obsolete (invalid) generation number. ||
4324 | */ | ||
4325 | if (batch >= BATCH_ZAP_PAGES && | ||
4326 | cond_resched_lock(&kvm->mmu_lock)) { | ||
4327 | batch = 0; | ||
4328 | goto restart; | ||
4329 | } | ||
4330 | |||
4331 | ret = kvm_mmu_prepare_zap_page(kvm, sp, | ||
4332 | &kvm->arch.zapped_obsolete_pages); | ||
4333 | batch += ret; | ||
4334 | |||
4335 | if (ret) | ||
4191 | goto restart; | 4336 | goto restart; |
4337 | } | ||
4192 | 4338 | ||
4193 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4339 | /* |
4194 | spin_unlock(&kvm->mmu_lock); | 4340 | * The TLB must be flushed before the page tables are freed, since a |
4341 | * lockless walk may still be using the pages. ||
4342 | */ | ||
4343 | kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); | ||
4195 | } | 4344 | } |
4196 | 4345 | ||
4197 | void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) | 4346 | /* |
4347 | * Fast-invalidate all shadow pages, using a lock-break technique to ||
4348 | * zap the obsolete pages. ||
4349 | * ||
4350 | * This is required when a memslot is being deleted or the VM is being ||
4351 | * destroyed; in those cases we must ensure that, after this function ||
4352 | * returns, the KVM MMU does not use any resource of the slot being ||
4353 | * deleted (or of any slot at all, respectively). ||
4354 | */ | ||
4355 | void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) | ||
4198 | { | 4356 | { |
4199 | struct kvm_mmu_page *sp, *node; | ||
4200 | LIST_HEAD(invalid_list); | ||
4201 | |||
4202 | spin_lock(&kvm->mmu_lock); | 4357 | spin_lock(&kvm->mmu_lock); |
4203 | restart: | 4358 | trace_kvm_mmu_invalidate_zap_all_pages(kvm); |
4204 | list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { | 4359 | kvm->arch.mmu_valid_gen++; |
4205 | if (!sp->mmio_cached) | ||
4206 | continue; | ||
4207 | if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) | ||
4208 | goto restart; | ||
4209 | } | ||
4210 | 4360 | ||
4211 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4361 | /* |
4362 | * Notify all vcpus to reload their shadow page tables ||
4363 | * and flush their TLBs. All vcpus will then switch to a new ||
4364 | * shadow page table with the new mmu_valid_gen. ||
4365 | * ||
4366 | * Note: this must be done under the protection of mmu_lock; ||
4367 | * otherwise a vcpu could purge a shadow page but miss the ||
4368 | * TLB flush. ||
4369 | */ | ||
4370 | kvm_reload_remote_mmus(kvm); | ||
4371 | |||
4372 | kvm_zap_obsolete_pages(kvm); | ||
4212 | spin_unlock(&kvm->mmu_lock); | 4373 | spin_unlock(&kvm->mmu_lock); |
4213 | } | 4374 | } |
4214 | 4375 | ||
4376 | static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) | ||
4377 | { | ||
4378 | return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); | ||
4379 | } | ||
4380 | |||
4381 | void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) | ||
4382 | { | ||
4383 | /* | ||
4384 | * The very rare case: the generation number has wrapped around, ||
4385 | * so zap all shadow pages. ||
4386 | * ||
4387 | * The max value it can see is MMIO_MAX_GEN - 1, since this function ||
4388 | * is not called when a memslot is marked invalid. ||
4389 | */ | ||
4390 | if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) { | ||
4391 | printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n"); | ||
4392 | kvm_mmu_invalidate_zap_all_pages(kvm); | ||
4393 | } | ||
4394 | } | ||
4395 | |||
4215 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | 4396 | static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) |
4216 | { | 4397 | { |
4217 | struct kvm *kvm; | 4398 | struct kvm *kvm; |
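kvm_current_mmio_generation() biases the memslot generation so that a fresh VM starts close to MMIO_MAX_GEN, which makes the wrap-around path in kvm_mmu_invalidate_mmio_sptes() easy to exercise. A small stand-alone calculation showing where the wrap actually happens (the function name here is illustrative):

#include <stdio.h>

#define MMIO_GEN_SHIFT 19
#define MMIO_GEN_MASK  ((1u << MMIO_GEN_SHIFT) - 1)
#define MMIO_MAX_GEN   ((1u << MMIO_GEN_SHIFT) - 1)

/* Mirrors kvm_current_mmio_generation(): the memslot generation is biased
 * so that wrap-around is reached after only ~150 memslot updates. */
static unsigned current_mmio_gen(unsigned memslots_generation)
{
	return (memslots_generation + MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
}

int main(void)
{
	for (unsigned g = 148; g <= 152; g++)
		printf("memslots generation %u -> mmio generation %u\n",
		       g, current_mmio_gen(g));
	/* At g == 149 and g == 150 the result reaches MMIO_MAX_GEN - 1 and
	 * MMIO_MAX_GEN, triggering the zap-all path; at g == 151 it wraps
	 * back to 0. */
	return 0;
}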
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) | |||
4240 | * want to shrink a VM that only started to populate its MMU | 4421 | * want to shrink a VM that only started to populate its MMU |
4241 | * anyway. | 4422 | * anyway. |
4242 | */ | 4423 | */ |
4243 | if (!kvm->arch.n_used_mmu_pages) | 4424 | if (!kvm->arch.n_used_mmu_pages && |
4425 | !kvm_has_zapped_obsolete_pages(kvm)) | ||
4244 | continue; | 4426 | continue; |
4245 | 4427 | ||
4246 | idx = srcu_read_lock(&kvm->srcu); | 4428 | idx = srcu_read_lock(&kvm->srcu); |
4247 | spin_lock(&kvm->mmu_lock); | 4429 | spin_lock(&kvm->mmu_lock); |
4248 | 4430 | ||
4431 | if (kvm_has_zapped_obsolete_pages(kvm)) { | ||
4432 | kvm_mmu_commit_zap_page(kvm, | ||
4433 | &kvm->arch.zapped_obsolete_pages); | ||
4434 | goto unlock; | ||
4435 | } | ||
4436 | |||
4249 | prepare_zap_oldest_mmu_page(kvm, &invalid_list); | 4437 | prepare_zap_oldest_mmu_page(kvm, &invalid_list); |
4250 | kvm_mmu_commit_zap_page(kvm, &invalid_list); | 4438 | kvm_mmu_commit_zap_page(kvm, &invalid_list); |
4251 | 4439 | ||
4440 | unlock: | ||
4252 | spin_unlock(&kvm->mmu_lock); | 4441 | spin_unlock(&kvm->mmu_lock); |
4253 | srcu_read_unlock(&kvm->srcu, idx); | 4442 | srcu_read_unlock(&kvm->srcu, idx); |
4254 | 4443 | ||