Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  301
1 file changed, 245 insertions, 56 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 004cc87b781c..0d094da49541 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * spte bits 3 ~ 11 are used as the low 9 bits of the generation
+ * number, and bits 52 ~ 61 are used as the high 10 bits of the
+ * generation number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT         3
+#define MMIO_SPTE_GEN_HIGH_SHIFT        52
+
+#define MMIO_GEN_SHIFT                  19
+#define MMIO_GEN_LOW_SHIFT              9
+#define MMIO_GEN_LOW_MASK               ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK                   ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN                    ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
 {
-        struct kvm_mmu_page *sp = page_header(__pa(sptep));
+        u64 mask;
+
+        WARN_ON(gen > MMIO_MAX_GEN);
+
+        mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+        mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+        return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+        unsigned int gen;
+
+        spte &= ~shadow_mmio_mask;
+
+        gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+        gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+        return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+        /*
+         * Init the kvm generation close to MMIO_MAX_GEN to easily test the
+         * code that handles generation-number wrap-around.
+         */
+        return (kvm_memslots(kvm)->generation +
+                      MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+                           unsigned access)
+{
+        unsigned int gen = kvm_current_mmio_generation(kvm);
+        u64 mask = generation_mmio_spte_mask(gen);
 
         access &= ACC_WRITE_MASK | ACC_USER_MASK;
+        mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
 
-        sp->mmio_cached = true;
-        trace_mark_mmio_spte(sptep, gfn, access);
-        mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+        trace_mark_mmio_spte(sptep, gfn, access, gen);
+        mmu_spte_set(sptep, mask);
 }
 
 static bool is_mmio_spte(u64 spte)
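Reviewer note: the encoding above splits a 19-bit generation number across the spare spte bits 3..11 and 52..61. The stand-alone sketch below (user-space C, reusing the patch's constants but none of its kernel dependencies) checks that the pack/unpack pair round-trips every possible generation value; it is an illustration, not kernel code.

/* Standalone sketch: how the 19-bit MMIO generation is packed into an spte. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MMIO_SPTE_GEN_LOW_SHIFT   3
#define MMIO_SPTE_GEN_HIGH_SHIFT  52
#define MMIO_GEN_LOW_SHIFT        9
#define MMIO_GEN_LOW_MASK         ((1 << MMIO_GEN_LOW_SHIFT) - 1)
#define MMIO_MAX_GEN              ((1 << 19) - 1)

/* Pack the low 9 bits into spte bits 3..11 and the high 10 bits into 52..61. */
static uint64_t gen_to_mask(unsigned int gen)
{
        uint64_t mask;

        mask  = (uint64_t)(gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
        mask |= ((uint64_t)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
        return mask;
}

/* Recover the generation from the two bit ranges (ignoring shadow_mmio_mask). */
static unsigned int mask_to_gen(uint64_t spte)
{
        unsigned int gen;

        gen  = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
        gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
        return gen;
}

int main(void)
{
        unsigned int gen;

        for (gen = 0; gen <= MMIO_MAX_GEN; gen++)
                assert(mask_to_gen(gen_to_mask(gen)) == gen);

        printf("all %u generation values round-trip\n", MMIO_MAX_GEN + 1);
        return 0;
}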
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-        return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+        u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+        return (spte & ~mask) >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
-        return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+        u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+        return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                          pfn_t pfn, unsigned access)
 {
         if (unlikely(is_noslot_pfn(pfn))) {
-                mark_mmio_spte(sptep, gfn, access);
+                mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
 
         return false;
 }
 
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+        unsigned int kvm_gen, spte_gen;
+
+        kvm_gen = kvm_current_mmio_generation(kvm);
+        spte_gen = get_mmio_spte_generation(spte);
+
+        trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+        return likely(kvm_gen == spte_gen);
+}
+
 static inline u64 rsvd_bits(int s, int e)
 {
         return ((1ULL << (e - s + 1)) - 1) << s;
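Because the generation now occupies bits inside the spte, get_mmio_spte_gfn() and get_mmio_spte_access() must strip those bits in addition to shadow_mmio_mask, which is what the new generation_mmio_spte_mask(MMIO_MAX_GEN) term does. A small user-space sketch of that point follows; the shadow-mmio mask value, the access bits and the gfn are made-up example values, not the kernel's.

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT        12
#define PAGE_MASK         (~(((uint64_t)1 << PAGE_SHIFT) - 1))
#define SHADOW_MMIO_MASK  (3ULL << 62)          /* example value only */
#define GEN_MAX           ((1u << 19) - 1)

/* Generation bits live at 3..11 (low 9) and 52..61 (high 10). */
static uint64_t gen_bits(unsigned int gen)
{
        return ((uint64_t)(gen & 0x1ff) << 3) | ((uint64_t)(gen >> 9) << 52);
}

int main(void)
{
        uint64_t gfn = 0x12345, access = 0x6;   /* stand-ins for a gfn and ACC_* bits */
        unsigned int gen = GEN_MAX;             /* worst case: every gen bit set */

        /* What mark_mmio_spte() conceptually stores. */
        uint64_t spte = gen_bits(gen) | SHADOW_MMIO_MASK | access |
                        (gfn << PAGE_SHIFT);

        /* Stripping only the mmio mask (the old code) leaks gen bits into the gfn. */
        uint64_t stale_mask = SHADOW_MMIO_MASK;
        /* Stripping the generation bits too (the new code) recovers both fields. */
        uint64_t new_mask = gen_bits(GEN_MAX) | SHADOW_MMIO_MASK;

        assert(((spte & ~stale_mask) >> PAGE_SHIFT) != gfn);
        assert(((spte & ~new_mask) >> PAGE_SHIFT) == gfn);
        assert(((spte & ~new_mask) & ~PAGE_MASK) == access);
        return 0;
}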
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 /*
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
+ */
 static u64 __get_spte_lockless(u64 *sptep)
 {
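The comment describes what is effectively a sequence counter limited to present->non-present transitions. A minimal user-space sketch of the reader side is shown below; it uses C11 atomics and invented names (clear_count, split_spte) purely for illustration, whereas the kernel relies on smp_rmb() and the per-page clear_spte_count.

#include <stdatomic.h>
#include <stdint.h>

struct split_spte {
        _Atomic uint32_t low;                   /* holds the present bit */
        _Atomic uint32_t high;
};

static _Atomic unsigned int clear_count;        /* bumped on present->non-present */

static uint64_t get_spte_lockless(struct split_spte *sptep)
{
        uint32_t low, high;
        unsigned int count;

retry:
        count = atomic_load_explicit(&clear_count, memory_order_acquire);

        low  = atomic_load_explicit(&sptep->low, memory_order_acquire);
        high = atomic_load_explicit(&sptep->high, memory_order_acquire);

        /*
         * If a clear happened meanwhile, or the low half changed under us,
         * the high half we read may belong to a different mapping: retry.
         */
        if (count != atomic_load_explicit(&clear_count, memory_order_acquire) ||
            low != atomic_load_explicit(&sptep->low, memory_order_relaxed))
                goto retry;

        return ((uint64_t)high << 32) | low;
}

int main(void)
{
        struct split_spte s;

        atomic_store(&s.low, 0x1u);             /* present */
        atomic_store(&s.high, 0xabcd0000u);
        return get_spte_lockless(&s) == 0xabcd000000000001ull ? 0 : 1;
}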
@@ -1511,6 +1584,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
         if (!direct)
                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+
+        /*
+         * active_mmu_pages is a FIFO list; do not move a page until it
+         * is zapped.  kvm_zap_obsolete_pages() depends on this ordering.
+         * See the comments in kvm_zap_obsolete_pages().
+         */
         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
         sp->parent_ptes = 0;
         mmu_page_add_parent_pte(vcpu, sp, parent_pte);
@@ -1648,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list);
 
+/*
+ * NOTE: be careful with zapped-obsolete pages
+ * (is_obsolete_sp(sp) && sp->role.invalid) when walking the hash list,
+ * since such a page has been removed from active_mmu_pages but can
+ * still be found on the hash list.
+ *
+ * for_each_gfn_indirect_valid_sp skips that kind of page, and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips
+ * all obsolete pages.
+ */
 #define for_each_gfn_sp(_kvm, _sp, _gfn)                                \
         hlist_for_each_entry(_sp,                                       \
           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1838,6 +1927,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
         __clear_sp_write_flooding_count(sp);
 }
 
+static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+        return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                                              gfn_t gfn,
                                              gva_t gaddr,
@@ -1864,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                 role.quadrant = quadrant;
         }
         for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+                if (is_obsolete_sp(vcpu->kvm, sp))
+                        continue;
+
                 if (!need_sync && sp->unsync)
                         need_sync = true;
 
@@ -1900,6 +1997,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
                 account_shadowed(vcpu->kvm, gfn);
         }
+        sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
         init_shadow_page_table(sp);
         trace_kvm_mmu_get_page(sp, true);
         return sp;
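Together with the is_obsolete_sp() helper introduced earlier, this one added line is the heart of the fast-invalidation scheme: every shadow page is tagged with the mmu_valid_gen it was created under, so bumping the VM-wide counter obsoletes all existing pages at once. A toy sketch of the pattern, with simplified stand-in structures rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct vm   { unsigned long mmu_valid_gen; };
struct page { unsigned long mmu_valid_gen; };

static void page_init(struct vm *vm, struct page *p)
{
        p->mmu_valid_gen = vm->mmu_valid_gen;   /* tag at creation time */
}

static bool page_is_obsolete(struct vm *vm, struct page *p)
{
        return p->mmu_valid_gen != vm->mmu_valid_gen;
}

int main(void)
{
        struct vm vm = { .mmu_valid_gen = 0 };
        struct page p;

        page_init(&vm, &p);
        printf("obsolete before invalidation: %d\n", page_is_obsolete(&vm, &p));

        vm.mmu_valid_gen++;                     /* what invalidation does, in O(1) */
        printf("obsolete after invalidation:  %d\n", page_is_obsolete(&vm, &p));
        return 0;
}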
@@ -2070,8 +2168,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
         ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
         kvm_mmu_page_unlink_children(kvm, sp);
         kvm_mmu_unlink_parents(kvm, sp);
+
         if (!sp->role.invalid && !sp->role.direct)
                 unaccount_shadowed(kvm, sp->gfn);
+
         if (sp->unsync)
                 kvm_unlink_unsync_page(kvm, sp);
         if (!sp->root_count) {
@@ -2081,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                 kvm_mod_used_mmu_pages(kvm, -1);
         } else {
                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
-                kvm_reload_remote_mmus(kvm);
+
+                /*
+                 * Obsolete pages cannot be used on any vcpu.
+                 * See the comments in kvm_mmu_invalidate_zap_all_pages().
+                 */
+                if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+                        kvm_reload_remote_mmus(kvm);
         }
 
         sp->role.invalid = 1;
@@ -2331,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         u64 spte;
         int ret = 0;
 
-        if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+        if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
                 return 0;
 
         spte = PT_PRESENT_MASK;
@@ -2869,22 +2975,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
         if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                 return;
-        spin_lock(&vcpu->kvm->mmu_lock);
+
         if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
             (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
              vcpu->arch.mmu.direct_map)) {
                 hpa_t root = vcpu->arch.mmu.root_hpa;
 
+                spin_lock(&vcpu->kvm->mmu_lock);
                 sp = page_header(root);
                 --sp->root_count;
                 if (!sp->root_count && sp->role.invalid) {
                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
                 }
-                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                 spin_unlock(&vcpu->kvm->mmu_lock);
+                vcpu->arch.mmu.root_hpa = INVALID_PAGE;
                 return;
         }
+
+        spin_lock(&vcpu->kvm->mmu_lock);
         for (i = 0; i < 4; ++i) {
                 hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3148,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
         return spte;
 }
 
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
         u64 spte;
 
         if (quickly_check_mmio_pf(vcpu, addr, direct))
-                return 1;
+                return RET_MMIO_PF_EMULATE;
 
         spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
 
@@ -3166,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
                 gfn_t gfn = get_mmio_spte_gfn(spte);
                 unsigned access = get_mmio_spte_access(spte);
 
+                if (!check_mmio_spte(vcpu->kvm, spte))
+                        return RET_MMIO_PF_INVALID;
+
                 if (direct)
                         addr = 0;
 
                 trace_handle_mmio_page_fault(addr, gfn, access);
                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-                return 1;
+                return RET_MMIO_PF_EMULATE;
         }
 
         /*
@@ -3179,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
          * it's a BUG if the gfn is not a mmio page.
          */
         if (direct && !check_direct_spte_mmio_pf(spte))
-                return -1;
+                return RET_MMIO_PF_BUG;
 
         /*
          * If the page table is zapped by other cpus, let CPU fault again on
          * the address.
          */
-        return 0;
+        return RET_MMIO_PF_RETRY;
 }
 EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
 
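The magic return values 1/0/-1 are replaced by named RET_MMIO_PF_* codes, including the new RET_MMIO_PF_INVALID used when the cached MMIO spte carries a stale generation. The sketch below shows how a caller is expected to dispatch on them; the enum values are believed to match the mmu.h definitions in this series but are reproduced here only for illustration.

#include <stdio.h>

/* Illustrative values; the real enum lives in arch/x86/kvm/mmu.h. */
enum {
        RET_MMIO_PF_RETRY   = 0,    /* spte zapped meanwhile, fault again   */
        RET_MMIO_PF_EMULATE = 1,    /* genuine MMIO access, emulate it      */
        RET_MMIO_PF_INVALID = 2,    /* stale generation, take the slow path */
        RET_MMIO_PF_BUG     = -1,   /* inconsistent state                   */
};

/* Sketch of the caller-side dispatch, mirroring nonpaging_page_fault() below. */
static int fault_path(int mmio_ret)
{
        if (mmio_ret != RET_MMIO_PF_INVALID)
                return mmio_ret;            /* EMULATE, RETRY or BUG: done  */

        /*
         * RET_MMIO_PF_INVALID: the cached MMIO spte belongs to an older
         * memslot generation, so fall through to the normal page-fault
         * path, which rebuilds the spte with the current generation.
         */
        printf("falling back to the slow page-fault path\n");
        return 0;
}

int main(void)
{
        return fault_path(RET_MMIO_PF_INVALID);
}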
@@ -3195,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
         int ret;
 
         ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-        WARN_ON(ret < 0);
+        WARN_ON(ret == RET_MMIO_PF_BUG);
         return ret;
 }
 
@@ -3207,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 
         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
 
-        if (unlikely(error_code & PFERR_RSVD_MASK))
-                return handle_mmio_page_fault(vcpu, gva, error_code, true);
+        if (unlikely(error_code & PFERR_RSVD_MASK)) {
+                r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+                if (likely(r != RET_MMIO_PF_INVALID))
+                        return r;
+        }
 
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3284,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         ASSERT(vcpu);
         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
-        if (unlikely(error_code & PFERR_RSVD_MASK))
-                return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+        if (unlikely(error_code & PFERR_RSVD_MASK)) {
+                r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+                if (likely(r != RET_MMIO_PF_INVALID))
+                        return r;
+        }
 
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3391,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
         *access &= mask;
 }
 
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-                           int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                           unsigned access, int *nr_present)
 {
         if (unlikely(is_mmio_spte(*sptep))) {
                 if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3401,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
                 }
 
                 (*nr_present)++;
-                mark_mmio_spte(sptep, gfn, access);
+                mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
 
@@ -3764,9 +3879,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
         if (r)
                 goto out;
         r = mmu_alloc_roots(vcpu);
-        spin_lock(&vcpu->kvm->mmu_lock);
-        mmu_sync_roots(vcpu);
-        spin_unlock(&vcpu->kvm->mmu_lock);
+        kvm_mmu_sync_roots(vcpu);
         if (r)
                 goto out;
         /* set_cr3() should ensure TLB has been flushed */
@@ -4179,39 +4292,107 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
         spin_unlock(&kvm->mmu_lock);
 }
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+#define BATCH_ZAP_PAGES   10
+static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
         struct kvm_mmu_page *sp, *node;
-        LIST_HEAD(invalid_list);
+        int batch = 0;
 
-        spin_lock(&kvm->mmu_lock);
 restart:
-        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+        list_for_each_entry_safe_reverse(sp, node,
+              &kvm->arch.active_mmu_pages, link) {
+                int ret;
+
+                /*
+                 * No obsolete page exists before a newly created page, since
+                 * active_mmu_pages is a FIFO list.
+                 */
+                if (!is_obsolete_sp(kvm, sp))
+                        break;
+
+                /*
+                 * Since we are walking the list in reverse and invalid pages
+                 * are moved to its head, skipping invalid pages avoids an
+                 * endless walk of the list.
+                 */
+                if (sp->role.invalid)
+                        continue;
+
+                /*
+                 * There is no need to flush the tlb since we only zap sps
+                 * with an invalid generation number.
+                 */
+                if (batch >= BATCH_ZAP_PAGES &&
+                      cond_resched_lock(&kvm->mmu_lock)) {
+                        batch = 0;
+                        goto restart;
+                }
+
+                ret = kvm_mmu_prepare_zap_page(kvm, sp,
+                                &kvm->arch.zapped_obsolete_pages);
+                batch += ret;
+
+                if (ret)
                         goto restart;
+        }
 
-        kvm_mmu_commit_zap_page(kvm, &invalid_list);
-        spin_unlock(&kvm->mmu_lock);
+        /*
+         * Flush the tlb before freeing the page tables, since a lockless
+         * walk may still be using the pages.
+         */
+        kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
 }
 
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+/*
+ * Fast-invalidate all shadow pages, using a lock-break technique to
+ * zap the obsolete pages.
+ *
+ * This is required when a memslot is being deleted or the VM is being
+ * destroyed; in these cases we must ensure that the KVM MMU no longer
+ * uses any resource of the slot being deleted (or of any slot) after
+ * the function returns.
+ */
+void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
-        struct kvm_mmu_page *sp, *node;
-        LIST_HEAD(invalid_list);
-
         spin_lock(&kvm->mmu_lock);
-restart:
-        list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-                if (!sp->mmio_cached)
-                        continue;
-                if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-                        goto restart;
-        }
+        trace_kvm_mmu_invalidate_zap_all_pages(kvm);
+        kvm->arch.mmu_valid_gen++;
 
-        kvm_mmu_commit_zap_page(kvm, &invalid_list);
+        /*
+         * Notify all vcpus to reload their shadow page tables and to
+         * flush the TLB.  All vcpus will then switch to a new shadow
+         * page table tagged with the new mmu_valid_gen.
+         *
+         * Note: this must be done under the protection of mmu_lock;
+         * otherwise a vcpu could purge its shadow pages but miss the
+         * TLB flush.
+         */
+        kvm_reload_remote_mmus(kvm);
+
+        kvm_zap_obsolete_pages(kvm);
         spin_unlock(&kvm->mmu_lock);
 }
 
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
+{
+        return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
+
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+        /*
+         * The very rare case: the generation number has wrapped around,
+         * so zap all shadow pages.
+         *
+         * The max value is MMIO_MAX_GEN - 1 since it is not called when a
+         * memslot is marked invalid.
+         */
+        if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+                printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+                kvm_mmu_invalidate_zap_all_pages(kvm);
+        }
+}
+
 static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 {
         struct kvm *kvm;
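kvm_zap_obsolete_pages() combines two ideas: the FIFO ordering of active_mmu_pages lets the reverse walk stop at the first non-obsolete page, and a lock break every BATCH_ZAP_PAGES zapped pages bounds mmu_lock hold time. The user-space sketch below captures the lock-break shape with pthread and invented structures; it is a simplification (for instance, the kernel also restarts the walk after a zap that released child pages).

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

#define BATCH 10

struct item { struct item *prev; bool obsolete, zapped; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds 'lock'; need_break() plays the role of cond_resched_lock(). */
static void zap_obsolete(struct item *tail, bool (*need_break)(void))
{
        int batch = 0;
        struct item *it;

restart:
        /* Walk from the tail: a FIFO list keeps the obsolete items at the back. */
        for (it = tail; it; it = it->prev) {
                if (!it->obsolete)
                        break;                  /* everything older is done   */
                if (it->zapped)
                        continue;               /* skip already-handled items */

                if (batch >= BATCH && need_break()) {
                        pthread_mutex_unlock(&lock);
                        /* other threads may run and modify the list here */
                        pthread_mutex_lock(&lock);
                        batch = 0;
                        goto restart;           /* list may have changed      */
                }

                it->zapped = true;              /* "prepare zap" stand-in     */
                batch++;
        }
}

static bool always(void) { return true; }

int main(void)
{
        struct item a = { .prev = NULL, .obsolete = true },
                    b = { .prev = &a,  .obsolete = true };

        pthread_mutex_lock(&lock);
        zap_obsolete(&b, always);
        pthread_mutex_unlock(&lock);
        return a.zapped && b.zapped ? 0 : 1;
}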
@@ -4240,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                  * want to shrink a VM that only started to populate its MMU
                  * anyway.
                  */
-                if (!kvm->arch.n_used_mmu_pages)
+                if (!kvm->arch.n_used_mmu_pages &&
+                      !kvm_has_zapped_obsolete_pages(kvm))
                         continue;
 
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);
 
+                if (kvm_has_zapped_obsolete_pages(kvm)) {
+                        kvm_mmu_commit_zap_page(kvm,
+                              &kvm->arch.zapped_obsolete_pages);
+                        goto unlock;
+                }
+
                 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
+unlock:
                 spin_unlock(&kvm->mmu_lock);
                 srcu_read_unlock(&kvm->srcu, idx);
 
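The shrinker change gives priority to work that is already half done: if a VM has pages sitting on zapped_obsolete_pages, committing those is cheaper than preparing new zaps of live pages. A toy sketch of that policy, with invented fields standing in for the kernel's lists and counters:

#include <stdbool.h>

struct vm_state {
        int n_used_pages;
        int n_zapped_obsolete;    /* prepared for zapping but not yet freed */
};

static bool worth_shrinking(const struct vm_state *vm)
{
        /* Skip VMs with nothing to reclaim at all. */
        return vm->n_used_pages || vm->n_zapped_obsolete;
}

static int shrink_one(struct vm_state *vm)
{
        if (vm->n_zapped_obsolete) {
                /* Cheap: just commit (free) the already-prepared pages. */
                int freed = vm->n_zapped_obsolete;
                vm->n_zapped_obsolete = 0;
                return freed;
        }

        /* Otherwise fall back to zapping the oldest live page. */
        if (vm->n_used_pages) {
                vm->n_used_pages--;
                return 1;
        }
        return 0;
}

int main(void)
{
        struct vm_state vm = { .n_used_pages = 4, .n_zapped_obsolete = 2 };

        if (!worth_shrinking(&vm))
                return 1;
        return shrink_one(&vm) == 2 ? 0 : 1;    /* commits the 2 prepared pages */
}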