Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  1228
1 file changed, 768 insertions(+), 460 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b76..8e8da7960db 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,7 +22,6 @@
22#include "mmu.h" 22#include "mmu.h"
23#include "x86.h" 23#include "x86.h"
24#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
26 25
27#include <linux/kvm_host.h> 26#include <linux/kvm_host.h>
28#include <linux/types.h> 27#include <linux/types.h>
@@ -148,7 +147,7 @@ module_param(oos_shadow, bool, 0644);
148#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 147#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
149 | PT64_NX_MASK) 148 | PT64_NX_MASK)
150 149
151#define RMAP_EXT 4 150#define PTE_LIST_EXT 4
152 151
153#define ACC_EXEC_MASK 1 152#define ACC_EXEC_MASK 1
154#define ACC_WRITE_MASK PT_WRITABLE_MASK 153#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -164,16 +163,16 @@ module_param(oos_shadow, bool, 0644);
164 163
165#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 164#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
166 165
167struct kvm_rmap_desc { 166struct pte_list_desc {
168 u64 *sptes[RMAP_EXT]; 167 u64 *sptes[PTE_LIST_EXT];
169 struct kvm_rmap_desc *more; 168 struct pte_list_desc *more;
170}; 169};
171 170
172struct kvm_shadow_walk_iterator { 171struct kvm_shadow_walk_iterator {
173 u64 addr; 172 u64 addr;
174 hpa_t shadow_addr; 173 hpa_t shadow_addr;
175 int level;
176 u64 *sptep; 174 u64 *sptep;
175 int level;
177 unsigned index; 176 unsigned index;
178}; 177};
179 178
@@ -182,32 +181,68 @@ struct kvm_shadow_walk_iterator {
182 shadow_walk_okay(&(_walker)); \ 181 shadow_walk_okay(&(_walker)); \
183 shadow_walk_next(&(_walker))) 182 shadow_walk_next(&(_walker)))
184 183
185typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 184#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
185 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
186 shadow_walk_okay(&(_walker)) && \
187 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
188 __shadow_walk_next(&(_walker), spte))
186 189
187static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_list_desc_cache;
188static struct kmem_cache *rmap_desc_cache;
189static struct kmem_cache *mmu_page_header_cache; 191static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages; 192static struct percpu_counter kvm_total_used_mmu_pages;
191 193
192static u64 __read_mostly shadow_trap_nonpresent_pte;
193static u64 __read_mostly shadow_notrap_nonpresent_pte;
194static u64 __read_mostly shadow_nx_mask; 194static u64 __read_mostly shadow_nx_mask;
195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
196static u64 __read_mostly shadow_user_mask; 196static u64 __read_mostly shadow_user_mask;
197static u64 __read_mostly shadow_accessed_mask; 197static u64 __read_mostly shadow_accessed_mask;
198static u64 __read_mostly shadow_dirty_mask; 198static u64 __read_mostly shadow_dirty_mask;
199static u64 __read_mostly shadow_mmio_mask;
199 200
200static inline u64 rsvd_bits(int s, int e) 201static void mmu_spte_set(u64 *sptep, u64 spte);
202
203void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
201{ 204{
202 return ((1ULL << (e - s + 1)) - 1) << s; 205 shadow_mmio_mask = mmio_mask;
203} 206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
204 208
205void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) 209static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
206{ 210{
207 shadow_trap_nonpresent_pte = trap_pte; 211 access &= ACC_WRITE_MASK | ACC_USER_MASK;
208 shadow_notrap_nonpresent_pte = notrap_pte; 212
213 trace_mark_mmio_spte(sptep, gfn, access);
214 mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
215}
216
217static bool is_mmio_spte(u64 spte)
218{
219 return (spte & shadow_mmio_mask) == shadow_mmio_mask;
220}
221
222static gfn_t get_mmio_spte_gfn(u64 spte)
223{
224 return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
225}
226
227static unsigned get_mmio_spte_access(u64 spte)
228{
229 return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
230}
231
232static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
233{
234 if (unlikely(is_noslot_pfn(pfn))) {
235 mark_mmio_spte(sptep, gfn, access);
236 return true;
237 }
238
239 return false;
240}
241
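Not part of the patch: a minimal standalone sketch of the MMIO-spte encoding introduced above. mark_mmio_spte() ORs shadow_mmio_mask with the low access bits (write/user) and the gfn shifted into the address field, and get_mmio_spte_gfn()/get_mmio_spte_access() undo that packing. The mask value below is a placeholder; the real one is injected via kvm_mmu_set_mmio_spte_mask().

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK (~((uint64_t)((1 << PAGE_SHIFT) - 1)))

/* Placeholder mask for illustration only. */
static const uint64_t shadow_mmio_mask = 0x3ull << 62;

static uint64_t mark_mmio(uint64_t gfn, unsigned access)
{
	/* as in mark_mmio_spte(): keep only write/user bits, pack gfn above PAGE_SHIFT */
	return shadow_mmio_mask | access | (gfn << PAGE_SHIFT);
}

static int is_mmio(uint64_t spte)
{
	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
}

static uint64_t mmio_gfn(uint64_t spte)
{
	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
}

static unsigned mmio_access(uint64_t spte)
{
	return (unsigned)((spte & ~shadow_mmio_mask) & ~PAGE_MASK);
}

int main(void)
{
	uint64_t spte = mark_mmio(0x1234, 0x6);	/* 0x6 ~ ACC_WRITE|ACC_USER */

	assert(is_mmio(spte));			/* is_mmio_spte() check */
	assert(mmio_gfn(spte) == 0x1234);
	assert(mmio_access(spte) == 0x6);
	printf("spte=%#llx gfn=%#llx access=%#x\n", (unsigned long long)spte,
	       (unsigned long long)mmio_gfn(spte), mmio_access(spte));
	return 0;
}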
242static inline u64 rsvd_bits(int s, int e)
243{
244 return ((1ULL << (e - s + 1)) - 1) << s;
209} 245}
210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
211 246
212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 247void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
213 u64 dirty_mask, u64 nx_mask, u64 x_mask) 248 u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +255,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
220} 255}
221EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 256EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
222 257
223static bool is_write_protection(struct kvm_vcpu *vcpu)
224{
225 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
226}
227
228static int is_cpuid_PSE36(void) 258static int is_cpuid_PSE36(void)
229{ 259{
230 return 1; 260 return 1;
@@ -237,8 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
237 267
238static int is_shadow_present_pte(u64 pte) 268static int is_shadow_present_pte(u64 pte)
239{ 269{
240 return pte != shadow_trap_nonpresent_pte 270 return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
241 && pte != shadow_notrap_nonpresent_pte;
242} 271}
243 272
244static int is_large_pte(u64 pte) 273static int is_large_pte(u64 pte)
@@ -246,11 +275,6 @@ static int is_large_pte(u64 pte)
246 return pte & PT_PAGE_SIZE_MASK; 275 return pte & PT_PAGE_SIZE_MASK;
247} 276}
248 277
249static int is_writable_pte(unsigned long pte)
250{
251 return pte & PT_WRITABLE_MASK;
252}
253
254static int is_dirty_gpte(unsigned long pte) 278static int is_dirty_gpte(unsigned long pte)
255{ 279{
256 return pte & PT_DIRTY_MASK; 280 return pte & PT_DIRTY_MASK;
@@ -282,26 +306,155 @@ static gfn_t pse36_gfn_delta(u32 gpte)
282 return (gpte & PT32_DIR_PSE36_MASK) << shift; 306 return (gpte & PT32_DIR_PSE36_MASK) << shift;
283} 307}
284 308
309#ifdef CONFIG_X86_64
285static void __set_spte(u64 *sptep, u64 spte) 310static void __set_spte(u64 *sptep, u64 spte)
286{ 311{
287 set_64bit(sptep, spte); 312 *sptep = spte;
288} 313}
289 314
290static u64 __xchg_spte(u64 *sptep, u64 new_spte) 315static void __update_clear_spte_fast(u64 *sptep, u64 spte)
291{ 316{
292#ifdef CONFIG_X86_64 317 *sptep = spte;
293 return xchg(sptep, new_spte); 318}
319
320static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
321{
322 return xchg(sptep, spte);
323}
324
325static u64 __get_spte_lockless(u64 *sptep)
326{
327 return ACCESS_ONCE(*sptep);
328}
329
330static bool __check_direct_spte_mmio_pf(u64 spte)
331{
332 /* It is valid if the spte is zapped. */
333 return spte == 0ull;
334}
294#else 335#else
295 u64 old_spte; 336union split_spte {
337 struct {
338 u32 spte_low;
339 u32 spte_high;
340 };
341 u64 spte;
342};
296 343
297 do { 344static void count_spte_clear(u64 *sptep, u64 spte)
298 old_spte = *sptep; 345{
299 } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 346 struct kvm_mmu_page *sp = page_header(__pa(sptep));
300 347
301 return old_spte; 348 if (is_shadow_present_pte(spte))
302#endif 349 return;
350
351 /* Ensure the spte is completely set before we increase the count */
352 smp_wmb();
353 sp->clear_spte_count++;
354}
355
356static void __set_spte(u64 *sptep, u64 spte)
357{
358 union split_spte *ssptep, sspte;
359
360 ssptep = (union split_spte *)sptep;
361 sspte = (union split_spte)spte;
362
363 ssptep->spte_high = sspte.spte_high;
364
365 /*
 366 * If we map the spte from nonpresent to present, we should store
 367 * the high bits first, then set the present bit, so the cpu cannot
 368 * fetch this spte while we are setting it.
369 */
370 smp_wmb();
371
372 ssptep->spte_low = sspte.spte_low;
303} 373}
304 374
375static void __update_clear_spte_fast(u64 *sptep, u64 spte)
376{
377 union split_spte *ssptep, sspte;
378
379 ssptep = (union split_spte *)sptep;
380 sspte = (union split_spte)spte;
381
382 ssptep->spte_low = sspte.spte_low;
383
384 /*
385 * If we map the spte from present to nonpresent, we should clear
 386 * the present bit first to avoid the vcpu fetching the old high bits.
387 */
388 smp_wmb();
389
390 ssptep->spte_high = sspte.spte_high;
391 count_spte_clear(sptep, spte);
392}
393
394static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
395{
396 union split_spte *ssptep, sspte, orig;
397
398 ssptep = (union split_spte *)sptep;
399 sspte = (union split_spte)spte;
400
401 /* xchg acts as a barrier before the setting of the high bits */
402 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
403 orig.spte_high = ssptep->spte_high;
404 ssptep->spte_high = sspte.spte_high;
405 count_spte_clear(sptep, spte);
406
407 return orig.spte;
408}
409
410/*
 411 * The idea of the lightweight way to get the spte on an x86_32 guest comes
 412 * from gup_get_pte (arch/x86/mm/gup.c).
 413 * The difference is that we cannot catch the spte tlb flush if we leave
 414 * guest mode, so we emulate it by increasing clear_spte_count when a spte
 415 * is cleared.
416 */
417static u64 __get_spte_lockless(u64 *sptep)
418{
419 struct kvm_mmu_page *sp = page_header(__pa(sptep));
420 union split_spte spte, *orig = (union split_spte *)sptep;
421 int count;
422
423retry:
424 count = sp->clear_spte_count;
425 smp_rmb();
426
427 spte.spte_low = orig->spte_low;
428 smp_rmb();
429
430 spte.spte_high = orig->spte_high;
431 smp_rmb();
432
433 if (unlikely(spte.spte_low != orig->spte_low ||
434 count != sp->clear_spte_count))
435 goto retry;
436
437 return spte.spte;
438}
439
440static bool __check_direct_spte_mmio_pf(u64 spte)
441{
442 union split_spte sspte = (union split_spte)spte;
443 u32 high_mmio_mask = shadow_mmio_mask >> 32;
444
445 /* It is valid if the spte is zapped. */
446 if (spte == 0ull)
447 return true;
448
449 /* It is valid if the spte is being zapped. */
450 if (sspte.spte_low == 0ull &&
451 (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
452 return true;
453
454 return false;
455}
456#endif
457
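An editor's standalone sketch (not from the patch) of the split-spte trick used in the 32-bit path above: the 64-bit spte is viewed as two 32-bit halves so each half can be written with a native store, and because the present bit lives in the low word, the ordering enforced by __set_spte()/__update_clear_spte_fast() keeps a walking cpu from ever seeing a present spte whose high bits are stale. The anonymous-struct union mirrors the patch; smp_wmb() is shown only as a comment.

#include <stdint.h>
#include <stdio.h>

/* 64-bit spte viewed as two 32-bit halves (little-endian x86:
 * spte_low holds the present bit). */
union split_spte {
	struct {
		uint32_t spte_low;
		uint32_t spte_high;
	};
	uint64_t spte;
};

int main(void)
{
	union split_spte live = { .spte = 0 };			/* spte in the page table */
	union split_spte val = { .spte = 0x0000000123456067ull };	/* value to install */

	/* nonpresent -> present, as in __set_spte(): publish the high half first, */
	live.spte_high = val.spte_high;
	/* ... smp_wmb() in the kernel ... then the low half, so the present bit   */
	/* only becomes visible once the high bits are already in place.           */
	live.spte_low = val.spte_low;

	printf("spte=%#llx (low=%#x high=%#x)\n",
	       (unsigned long long)live.spte, live.spte_low, live.spte_high);
	return 0;
}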
305static bool spte_has_volatile_bits(u64 spte) 458static bool spte_has_volatile_bits(u64 spte)
306{ 459{
307 if (!shadow_accessed_mask) 460 if (!shadow_accessed_mask)
@@ -322,12 +475,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
322 return (old_spte & bit_mask) && !(new_spte & bit_mask); 475 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323} 476}
324 477
325static void update_spte(u64 *sptep, u64 new_spte) 478/* Rules for using mmu_spte_set:
479 * Set the sptep from nonpresent to present.
480 * Note: the sptep being assigned *must* be either not present
481 * or in a state where the hardware will not attempt to update
482 * the spte.
483 */
484static void mmu_spte_set(u64 *sptep, u64 new_spte)
485{
486 WARN_ON(is_shadow_present_pte(*sptep));
487 __set_spte(sptep, new_spte);
488}
489
490/* Rules for using mmu_spte_update:
 491 * Update the state bits; this means the mapped pfn is not changed.
492 */
493static void mmu_spte_update(u64 *sptep, u64 new_spte)
326{ 494{
327 u64 mask, old_spte = *sptep; 495 u64 mask, old_spte = *sptep;
328 496
329 WARN_ON(!is_rmap_spte(new_spte)); 497 WARN_ON(!is_rmap_spte(new_spte));
330 498
499 if (!is_shadow_present_pte(old_spte))
500 return mmu_spte_set(sptep, new_spte);
501
331 new_spte |= old_spte & shadow_dirty_mask; 502 new_spte |= old_spte & shadow_dirty_mask;
332 503
333 mask = shadow_accessed_mask; 504 mask = shadow_accessed_mask;
@@ -335,9 +506,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
335 mask |= shadow_dirty_mask; 506 mask |= shadow_dirty_mask;
336 507
337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) 508 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
338 __set_spte(sptep, new_spte); 509 __update_clear_spte_fast(sptep, new_spte);
339 else 510 else
340 old_spte = __xchg_spte(sptep, new_spte); 511 old_spte = __update_clear_spte_slow(sptep, new_spte);
341 512
342 if (!shadow_accessed_mask) 513 if (!shadow_accessed_mask)
343 return; 514 return;
@@ -348,6 +519,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 519 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
349} 520}
350 521
522/*
523 * Rules for using mmu_spte_clear_track_bits:
 524 * It sets the sptep from present to nonpresent and tracks the
 525 * state bits; it is used to clear a last-level sptep.
526 */
527static int mmu_spte_clear_track_bits(u64 *sptep)
528{
529 pfn_t pfn;
530 u64 old_spte = *sptep;
531
532 if (!spte_has_volatile_bits(old_spte))
533 __update_clear_spte_fast(sptep, 0ull);
534 else
535 old_spte = __update_clear_spte_slow(sptep, 0ull);
536
537 if (!is_rmap_spte(old_spte))
538 return 0;
539
540 pfn = spte_to_pfn(old_spte);
541 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
542 kvm_set_pfn_accessed(pfn);
543 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
544 kvm_set_pfn_dirty(pfn);
545 return 1;
546}
547
548/*
549 * Rules for using mmu_spte_clear_no_track:
 550 * Directly clear the spte without caring about the state bits of the sptep;
 551 * it is used to set the upper-level spte.
552 */
553static void mmu_spte_clear_no_track(u64 *sptep)
554{
555 __update_clear_spte_fast(sptep, 0ull);
556}
557
558static u64 mmu_spte_get_lockless(u64 *sptep)
559{
560 return __get_spte_lockless(sptep);
561}
562
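Taken together, the helpers above partition spte writes by transition; a compressed restatement (editor's summary, drawn only from the comments and code above):

/*
 *   nonpresent -> present                    : mmu_spte_set()
 *   present    -> present  (pfn unchanged)   : mmu_spte_update()
 *   present    -> nonpresent, last level     : mmu_spte_clear_track_bits()
 *                (propagates accessed/dirty bits to the pfn)
 *   present    -> nonpresent, upper level    : mmu_spte_clear_no_track()
 *   lockless read                            : mmu_spte_get_lockless()
 */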
563static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
564{
565 rcu_read_lock();
566 atomic_inc(&vcpu->kvm->arch.reader_counter);
567
568 /* Increase the counter before walking shadow page table */
569 smp_mb__after_atomic_inc();
570}
571
572static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
573{
 574 /* Decrease the counter after the shadow page table walk has finished */
575 smp_mb__before_atomic_dec();
576 atomic_dec(&vcpu->kvm->arch.reader_counter);
577 rcu_read_unlock();
578}
579
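Not part of the patch: a sketch of how these begin/end helpers pair with the for_each_shadow_entry_lockless macro added earlier in the series. walk_example() is a hypothetical caller, shown only to make the intended calling pattern explicit.

/* Hypothetical caller -- not from the patch. */
static void walk_example(struct kvm_vcpu *vcpu, u64 addr)
{
	struct kvm_shadow_walk_iterator iterator;
	u64 spte;

	walk_shadow_page_lockless_begin(vcpu);	/* rcu_read_lock() + reader_counter++ */
	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
		if (!is_shadow_present_pte(spte))
			break;
		/* inspect spte here without holding mmu_lock */
	}
	walk_shadow_page_lockless_end(vcpu);	/* reader_counter-- + rcu_read_unlock() */
}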
351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 580static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
352 struct kmem_cache *base_cache, int min) 581 struct kmem_cache *base_cache, int min)
353{ 582{
@@ -397,12 +626,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
397{ 626{
398 int r; 627 int r;
399 628
400 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache, 629 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
401 pte_chain_cache, 4); 630 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
402 if (r)
403 goto out;
404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
406 if (r) 631 if (r)
407 goto out; 632 goto out;
408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 633 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +641,8 @@ out:
416 641
417static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 642static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
418{ 643{
419 mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 644 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
420 mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 645 pte_list_desc_cache);
421 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 646 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
422 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 647 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
423 mmu_page_header_cache); 648 mmu_page_header_cache);
@@ -433,26 +658,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
433 return p; 658 return p;
434} 659}
435 660
436static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu) 661static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
437{ 662{
438 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache, 663 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
439 sizeof(struct kvm_pte_chain)); 664 sizeof(struct pte_list_desc));
440} 665}
441 666
442static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 667static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
443{ 668{
444 kmem_cache_free(pte_chain_cache, pc); 669 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
445}
446
447static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
448{
449 return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
450 sizeof(struct kvm_rmap_desc));
451}
452
453static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
454{
455 kmem_cache_free(rmap_desc_cache, rd);
456} 670}
457 671
458static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 672static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +712,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
498 linfo = lpage_info_slot(gfn, slot, i); 712 linfo = lpage_info_slot(gfn, slot, i);
499 linfo->write_count += 1; 713 linfo->write_count += 1;
500 } 714 }
715 kvm->arch.indirect_shadow_pages++;
501} 716}
502 717
503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 718static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +728,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
513 linfo->write_count -= 1; 728 linfo->write_count -= 1;
514 WARN_ON(linfo->write_count < 0); 729 WARN_ON(linfo->write_count < 0);
515 } 730 }
731 kvm->arch.indirect_shadow_pages--;
516} 732}
517 733
518static int has_wrprotected_page(struct kvm *kvm, 734static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +804,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
588} 804}
589 805
590/* 806/*
591 * Take gfn and return the reverse mapping to it. 807 * Pte mapping structures:
592 */
593
594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
595{
596 struct kvm_memory_slot *slot;
597 struct kvm_lpage_info *linfo;
598
599 slot = gfn_to_memslot(kvm, gfn);
600 if (likely(level == PT_PAGE_TABLE_LEVEL))
601 return &slot->rmap[gfn - slot->base_gfn];
602
603 linfo = lpage_info_slot(gfn, slot, level);
604
605 return &linfo->rmap_pde;
606}
607
608/*
609 * Reverse mapping data structures:
610 * 808 *
 611 * If rmapp bit zero is zero, then rmapp point to the shadw page table entry 809 * If pte_list bit zero is zero, then pte_list points to the spte.
612 * that points to page_address(page).
613 * 810 *
614 * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc 811 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
615 * containing more mappings. 812 * pte_list_desc containing more mappings.
616 * 813 *
617 * Returns the number of rmap entries before the spte was added or zero if 814 * Returns the number of pte entries before the spte was added or zero if
618 * the spte was not added. 815 * the spte was not added.
619 * 816 *
620 */ 817 */
621static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 818static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
819 unsigned long *pte_list)
622{ 820{
623 struct kvm_mmu_page *sp; 821 struct pte_list_desc *desc;
624 struct kvm_rmap_desc *desc;
625 unsigned long *rmapp;
626 int i, count = 0; 822 int i, count = 0;
627 823
628 if (!is_rmap_spte(*spte)) 824 if (!*pte_list) {
629 return count; 825 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
630 sp = page_header(__pa(spte)); 826 *pte_list = (unsigned long)spte;
631 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 827 } else if (!(*pte_list & 1)) {
632 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 828 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
633 if (!*rmapp) { 829 desc = mmu_alloc_pte_list_desc(vcpu);
634 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); 830 desc->sptes[0] = (u64 *)*pte_list;
635 *rmapp = (unsigned long)spte;
636 } else if (!(*rmapp & 1)) {
637 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
638 desc = mmu_alloc_rmap_desc(vcpu);
639 desc->sptes[0] = (u64 *)*rmapp;
640 desc->sptes[1] = spte; 831 desc->sptes[1] = spte;
641 *rmapp = (unsigned long)desc | 1; 832 *pte_list = (unsigned long)desc | 1;
642 ++count; 833 ++count;
643 } else { 834 } else {
644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 835 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 836 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
646 while (desc->sptes[RMAP_EXT-1] && desc->more) { 837 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
647 desc = desc->more; 838 desc = desc->more;
648 count += RMAP_EXT; 839 count += PTE_LIST_EXT;
649 } 840 }
650 if (desc->sptes[RMAP_EXT-1]) { 841 if (desc->sptes[PTE_LIST_EXT-1]) {
651 desc->more = mmu_alloc_rmap_desc(vcpu); 842 desc->more = mmu_alloc_pte_list_desc(vcpu);
652 desc = desc->more; 843 desc = desc->more;
653 } 844 }
654 for (i = 0; desc->sptes[i]; ++i) 845 for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +849,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
658 return count; 849 return count;
659} 850}
660 851
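A standalone model (not from the patch) of the tagged-pointer scheme the comment above describes: because sptes and pte_list_desc structures are more than byte-aligned, bit zero of the pte_list word is free to distinguish "one spte" from "a descriptor chain".

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PTE_LIST_EXT 4

struct pte_list_desc {
	uint64_t *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

/* Decode a pte_list word: 0 = empty, bit0 clear = single spte, bit0 set = desc. */
static void pte_list_print(unsigned long pte_list)
{
	if (!pte_list) {
		printf("empty\n");
	} else if (!(pte_list & 1)) {
		printf("single spte %p\n", (void *)pte_list);
	} else {
		struct pte_list_desc *desc = (struct pte_list_desc *)(pte_list & ~1ul);
		for (int i = 0; i < PTE_LIST_EXT && desc->sptes[i]; i++)
			printf("spte[%d] %p\n", i, (void *)desc->sptes[i]);
	}
}

int main(void)
{
	uint64_t spte_a = 0, spte_b = 0;
	unsigned long pte_list = 0;
	struct pte_list_desc *desc;

	pte_list_print(pte_list);		/* empty     */

	pte_list = (unsigned long)&spte_a;	/* 0 -> 1    */
	pte_list_print(pte_list);

	desc = calloc(1, sizeof(*desc));	/* 1 -> many */
	desc->sptes[0] = (uint64_t *)pte_list;
	desc->sptes[1] = &spte_b;
	pte_list = (unsigned long)desc | 1;
	pte_list_print(pte_list);

	free(desc);
	return 0;
}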
661static void rmap_desc_remove_entry(unsigned long *rmapp, 852static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
662 struct kvm_rmap_desc *desc, 853{
663 int i, 854 struct pte_list_desc *desc;
664 struct kvm_rmap_desc *prev_desc) 855 u64 *prev_spte;
856 int i;
857
858 if (!*pte_list)
859 return NULL;
860 else if (!(*pte_list & 1)) {
861 if (!spte)
862 return (u64 *)*pte_list;
863 return NULL;
864 }
865 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
866 prev_spte = NULL;
867 while (desc) {
868 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
869 if (prev_spte == spte)
870 return desc->sptes[i];
871 prev_spte = desc->sptes[i];
872 }
873 desc = desc->more;
874 }
875 return NULL;
876}
877
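The cursor-style iterator above is driven by passing NULL to fetch the first spte and the previous return value to advance, the same convention the existing rmap_next() loops rely on. A hedged sketch with a hypothetical helper (count_mappings() is not in the patch):

/* Hypothetical helper -- counts how many sptes a pte_list currently holds. */
static int count_mappings(unsigned long *pte_list)
{
	u64 *spte;
	int n = 0;

	for (spte = pte_list_next(pte_list, NULL); spte;
	     spte = pte_list_next(pte_list, spte))
		n++;

	return n;
}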
878static void
879pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
880 int i, struct pte_list_desc *prev_desc)
665{ 881{
666 int j; 882 int j;
667 883
668 for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j) 884 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
669 ; 885 ;
670 desc->sptes[i] = desc->sptes[j]; 886 desc->sptes[i] = desc->sptes[j];
671 desc->sptes[j] = NULL; 887 desc->sptes[j] = NULL;
672 if (j != 0) 888 if (j != 0)
673 return; 889 return;
674 if (!prev_desc && !desc->more) 890 if (!prev_desc && !desc->more)
675 *rmapp = (unsigned long)desc->sptes[0]; 891 *pte_list = (unsigned long)desc->sptes[0];
676 else 892 else
677 if (prev_desc) 893 if (prev_desc)
678 prev_desc->more = desc->more; 894 prev_desc->more = desc->more;
679 else 895 else
680 *rmapp = (unsigned long)desc->more | 1; 896 *pte_list = (unsigned long)desc->more | 1;
681 mmu_free_rmap_desc(desc); 897 mmu_free_pte_list_desc(desc);
682} 898}
683 899
684static void rmap_remove(struct kvm *kvm, u64 *spte) 900static void pte_list_remove(u64 *spte, unsigned long *pte_list)
685{ 901{
686 struct kvm_rmap_desc *desc; 902 struct pte_list_desc *desc;
687 struct kvm_rmap_desc *prev_desc; 903 struct pte_list_desc *prev_desc;
688 struct kvm_mmu_page *sp;
689 gfn_t gfn;
690 unsigned long *rmapp;
691 int i; 904 int i;
692 905
693 sp = page_header(__pa(spte)); 906 if (!*pte_list) {
694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 907 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
696 if (!*rmapp) {
697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
698 BUG(); 908 BUG();
699 } else if (!(*rmapp & 1)) { 909 } else if (!(*pte_list & 1)) {
700 rmap_printk("rmap_remove: %p 1->0\n", spte); 910 rmap_printk("pte_list_remove: %p 1->0\n", spte);
701 if ((u64 *)*rmapp != spte) { 911 if ((u64 *)*pte_list != spte) {
702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); 912 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
703 BUG(); 913 BUG();
704 } 914 }
705 *rmapp = 0; 915 *pte_list = 0;
706 } else { 916 } else {
707 rmap_printk("rmap_remove: %p many->many\n", spte); 917 rmap_printk("pte_list_remove: %p many->many\n", spte);
708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 918 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
709 prev_desc = NULL; 919 prev_desc = NULL;
710 while (desc) { 920 while (desc) {
711 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) 921 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
712 if (desc->sptes[i] == spte) { 922 if (desc->sptes[i] == spte) {
713 rmap_desc_remove_entry(rmapp, 923 pte_list_desc_remove_entry(pte_list,
714 desc, i, 924 desc, i,
715 prev_desc); 925 prev_desc);
716 return; 926 return;
@@ -718,62 +928,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
718 prev_desc = desc; 928 prev_desc = desc;
719 desc = desc->more; 929 desc = desc->more;
720 } 930 }
721 pr_err("rmap_remove: %p many->many\n", spte); 931 pr_err("pte_list_remove: %p many->many\n", spte);
722 BUG(); 932 BUG();
723 } 933 }
724} 934}
725 935
726static int set_spte_track_bits(u64 *sptep, u64 new_spte) 936typedef void (*pte_list_walk_fn) (u64 *spte);
937static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
727{ 938{
728 pfn_t pfn; 939 struct pte_list_desc *desc;
729 u64 old_spte = *sptep; 940 int i;
730 941
731 if (!spte_has_volatile_bits(old_spte)) 942 if (!*pte_list)
732 __set_spte(sptep, new_spte); 943 return;
733 else
734 old_spte = __xchg_spte(sptep, new_spte);
735 944
736 if (!is_rmap_spte(old_spte)) 945 if (!(*pte_list & 1))
737 return 0; 946 return fn((u64 *)*pte_list);
738 947
739 pfn = spte_to_pfn(old_spte); 948 desc = (struct pte_list_desc *)(*pte_list & ~1ul);
740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 949 while (desc) {
741 kvm_set_pfn_accessed(pfn); 950 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 951 fn(desc->sptes[i]);
743 kvm_set_pfn_dirty(pfn); 952 desc = desc->more;
744 return 1; 953 }
745} 954}
746 955
747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 956/*
957 * Take gfn and return the reverse mapping to it.
958 */
959static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
748{ 960{
749 if (set_spte_track_bits(sptep, new_spte)) 961 struct kvm_memory_slot *slot;
750 rmap_remove(kvm, sptep); 962 struct kvm_lpage_info *linfo;
963
964 slot = gfn_to_memslot(kvm, gfn);
965 if (likely(level == PT_PAGE_TABLE_LEVEL))
966 return &slot->rmap[gfn - slot->base_gfn];
967
968 linfo = lpage_info_slot(gfn, slot, level);
969
970 return &linfo->rmap_pde;
971}
972
973static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
974{
975 struct kvm_mmu_page *sp;
976 unsigned long *rmapp;
977
978 sp = page_header(__pa(spte));
979 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
980 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
981 return pte_list_add(vcpu, spte, rmapp);
751} 982}
752 983
753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 984static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
754{ 985{
755 struct kvm_rmap_desc *desc; 986 return pte_list_next(rmapp, spte);
756 u64 *prev_spte; 987}
757 int i;
758 988
759 if (!*rmapp) 989static void rmap_remove(struct kvm *kvm, u64 *spte)
760 return NULL; 990{
761 else if (!(*rmapp & 1)) { 991 struct kvm_mmu_page *sp;
762 if (!spte) 992 gfn_t gfn;
763 return (u64 *)*rmapp; 993 unsigned long *rmapp;
764 return NULL; 994
765 } 995 sp = page_header(__pa(spte));
766 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 996 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
767 prev_spte = NULL; 997 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
768 while (desc) { 998 pte_list_remove(spte, rmapp);
769 for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { 999}
770 if (prev_spte == spte) 1000
771 return desc->sptes[i]; 1001static void drop_spte(struct kvm *kvm, u64 *sptep)
772 prev_spte = desc->sptes[i]; 1002{
773 } 1003 if (mmu_spte_clear_track_bits(sptep))
774 desc = desc->more; 1004 rmap_remove(kvm, sptep);
775 }
776 return NULL;
777} 1005}
778 1006
779static int rmap_write_protect(struct kvm *kvm, u64 gfn) 1007static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1018,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
790 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1018 BUG_ON(!(*spte & PT_PRESENT_MASK));
791 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 1019 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
792 if (is_writable_pte(*spte)) { 1020 if (is_writable_pte(*spte)) {
793 update_spte(spte, *spte & ~PT_WRITABLE_MASK); 1021 mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
794 write_protected = 1; 1022 write_protected = 1;
795 } 1023 }
796 spte = rmap_next(kvm, rmapp, spte); 1024 spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1035,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
807 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 1035 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
808 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 1036 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
809 if (is_writable_pte(*spte)) { 1037 if (is_writable_pte(*spte)) {
810 drop_spte(kvm, spte, 1038 drop_spte(kvm, spte);
811 shadow_trap_nonpresent_pte);
812 --kvm->stat.lpages; 1039 --kvm->stat.lpages;
813 spte = NULL; 1040 spte = NULL;
814 write_protected = 1; 1041 write_protected = 1;
@@ -829,7 +1056,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
829 while ((spte = rmap_next(kvm, rmapp, NULL))) { 1056 while ((spte = rmap_next(kvm, rmapp, NULL))) {
830 BUG_ON(!(*spte & PT_PRESENT_MASK)); 1057 BUG_ON(!(*spte & PT_PRESENT_MASK));
831 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 1058 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
832 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1059 drop_spte(kvm, spte);
833 need_tlb_flush = 1; 1060 need_tlb_flush = 1;
834 } 1061 }
835 return need_tlb_flush; 1062 return need_tlb_flush;
@@ -851,7 +1078,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
851 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 1078 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
852 need_flush = 1; 1079 need_flush = 1;
853 if (pte_write(*ptep)) { 1080 if (pte_write(*ptep)) {
854 drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 1081 drop_spte(kvm, spte);
855 spte = rmap_next(kvm, rmapp, NULL); 1082 spte = rmap_next(kvm, rmapp, NULL);
856 } else { 1083 } else {
857 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); 1084 new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1087,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
860 new_spte &= ~PT_WRITABLE_MASK; 1087 new_spte &= ~PT_WRITABLE_MASK;
861 new_spte &= ~SPTE_HOST_WRITEABLE; 1088 new_spte &= ~SPTE_HOST_WRITEABLE;
862 new_spte &= ~shadow_accessed_mask; 1089 new_spte &= ~shadow_accessed_mask;
863 set_spte_track_bits(spte, new_spte); 1090 mmu_spte_clear_track_bits(spte);
1091 mmu_spte_set(spte, new_spte);
864 spte = rmap_next(kvm, rmapp, spte); 1092 spte = rmap_next(kvm, rmapp, spte);
865 } 1093 }
866 } 1094 }
@@ -1032,151 +1260,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1260 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033} 1261}
1034 1262
1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1263/*
 1264 * Remove the sp from the shadow page cache; after calling it,
 1265 * we can no longer find this sp in the cache, but the shadow
 1266 * page table is still valid.
 1267 * It should be called under the protection of the mmu lock.
1268 */
1269static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
1036{ 1270{
1037 ASSERT(is_empty_shadow_page(sp->spt)); 1271 ASSERT(is_empty_shadow_page(sp->spt));
1038 hlist_del(&sp->hash_link); 1272 hlist_del(&sp->hash_link);
1039 list_del(&sp->link);
1040 free_page((unsigned long)sp->spt);
1041 if (!sp->role.direct) 1273 if (!sp->role.direct)
1042 free_page((unsigned long)sp->gfns); 1274 free_page((unsigned long)sp->gfns);
1043 kmem_cache_free(mmu_page_header_cache, sp);
1044 kvm_mod_used_mmu_pages(kvm, -1);
1045} 1275}
1046 1276
1047static unsigned kvm_page_table_hashfn(gfn_t gfn) 1277/*
 1278 * Free the shadow page table and the sp; we can do it
 1279 * outside the protection of the mmu lock.
1280 */
1281static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
1048{ 1282{
1049 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1283 list_del(&sp->link);
1284 free_page((unsigned long)sp->spt);
1285 kmem_cache_free(mmu_page_header_cache, sp);
1050} 1286}
1051 1287
1052static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 1288static unsigned kvm_page_table_hashfn(gfn_t gfn)
1053 u64 *parent_pte, int direct)
1054{ 1289{
1055 struct kvm_mmu_page *sp; 1290 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
1056
1057 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
1058 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1059 if (!direct)
1060 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1061 PAGE_SIZE);
1062 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1063 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1065 sp->multimapped = 0;
1066 sp->parent_pte = parent_pte;
1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1068 return sp;
1069} 1291}
1070 1292
1071static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1293static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
1072 struct kvm_mmu_page *sp, u64 *parent_pte) 1294 struct kvm_mmu_page *sp, u64 *parent_pte)
1073{ 1295{
1074 struct kvm_pte_chain *pte_chain;
1075 struct hlist_node *node;
1076 int i;
1077
1078 if (!parent_pte) 1296 if (!parent_pte)
1079 return; 1297 return;
1080 if (!sp->multimapped) {
1081 u64 *old = sp->parent_pte;
1082 1298
1083 if (!old) { 1299 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
1084 sp->parent_pte = parent_pte;
1085 return;
1086 }
1087 sp->multimapped = 1;
1088 pte_chain = mmu_alloc_pte_chain(vcpu);
1089 INIT_HLIST_HEAD(&sp->parent_ptes);
1090 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1091 pte_chain->parent_ptes[0] = old;
1092 }
1093 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
1094 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
1095 continue;
1096 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
1097 if (!pte_chain->parent_ptes[i]) {
1098 pte_chain->parent_ptes[i] = parent_pte;
1099 return;
1100 }
1101 }
1102 pte_chain = mmu_alloc_pte_chain(vcpu);
1103 BUG_ON(!pte_chain);
1104 hlist_add_head(&pte_chain->link, &sp->parent_ptes);
1105 pte_chain->parent_ptes[0] = parent_pte;
1106} 1300}
1107 1301
1108static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1302static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
1109 u64 *parent_pte) 1303 u64 *parent_pte)
1110{ 1304{
1111 struct kvm_pte_chain *pte_chain; 1305 pte_list_remove(parent_pte, &sp->parent_ptes);
1112 struct hlist_node *node;
1113 int i;
1114
1115 if (!sp->multimapped) {
1116 BUG_ON(sp->parent_pte != parent_pte);
1117 sp->parent_pte = NULL;
1118 return;
1119 }
1120 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1121 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1122 if (!pte_chain->parent_ptes[i])
1123 break;
1124 if (pte_chain->parent_ptes[i] != parent_pte)
1125 continue;
1126 while (i + 1 < NR_PTE_CHAIN_ENTRIES
1127 && pte_chain->parent_ptes[i + 1]) {
1128 pte_chain->parent_ptes[i]
1129 = pte_chain->parent_ptes[i + 1];
1130 ++i;
1131 }
1132 pte_chain->parent_ptes[i] = NULL;
1133 if (i == 0) {
1134 hlist_del(&pte_chain->link);
1135 mmu_free_pte_chain(pte_chain);
1136 if (hlist_empty(&sp->parent_ptes)) {
1137 sp->multimapped = 0;
1138 sp->parent_pte = NULL;
1139 }
1140 }
1141 return;
1142 }
1143 BUG();
1144} 1306}
1145 1307
1146static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1308static void drop_parent_pte(struct kvm_mmu_page *sp,
1309 u64 *parent_pte)
1147{ 1310{
1148 struct kvm_pte_chain *pte_chain; 1311 mmu_page_remove_parent_pte(sp, parent_pte);
1149 struct hlist_node *node; 1312 mmu_spte_clear_no_track(parent_pte);
1150 struct kvm_mmu_page *parent_sp; 1313}
1151 int i;
1152
1153 if (!sp->multimapped && sp->parent_pte) {
1154 parent_sp = page_header(__pa(sp->parent_pte));
1155 fn(parent_sp, sp->parent_pte);
1156 return;
1157 }
1158
1159 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
1160 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
1161 u64 *spte = pte_chain->parent_ptes[i];
1162 1314
1163 if (!spte) 1315static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
1164 break; 1316 u64 *parent_pte, int direct)
1165 parent_sp = page_header(__pa(spte)); 1317{
1166 fn(parent_sp, spte); 1318 struct kvm_mmu_page *sp;
1167 } 1319 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
1320 sizeof *sp);
1321 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
1322 if (!direct)
1323 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
1324 PAGE_SIZE);
1325 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
1326 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
1327 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
1328 sp->parent_ptes = 0;
1329 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1330 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
1331 return sp;
1168} 1332}
1169 1333
1170static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1334static void mark_unsync(u64 *spte);
1171static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1335static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1172{ 1336{
1173 mmu_parent_walk(sp, mark_unsync); 1337 pte_list_walk(&sp->parent_ptes, mark_unsync);
1174} 1338}
1175 1339
1176static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1340static void mark_unsync(u64 *spte)
1177{ 1341{
1342 struct kvm_mmu_page *sp;
1178 unsigned int index; 1343 unsigned int index;
1179 1344
1345 sp = page_header(__pa(spte));
1180 index = spte - sp->spt; 1346 index = spte - sp->spt;
1181 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1347 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
1182 return; 1348 return;
@@ -1185,15 +1351,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
1185 kvm_mmu_mark_parents_unsync(sp); 1351 kvm_mmu_mark_parents_unsync(sp);
1186} 1352}
1187 1353
1188static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1189 struct kvm_mmu_page *sp)
1190{
1191 int i;
1192
1193 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1194 sp->spt[i] = shadow_trap_nonpresent_pte;
1195}
1196
1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1354static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1198 struct kvm_mmu_page *sp) 1355 struct kvm_mmu_page *sp)
1199{ 1356{
@@ -1475,6 +1632,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
1475 } 1632 }
1476} 1633}
1477 1634
1635static void init_shadow_page_table(struct kvm_mmu_page *sp)
1636{
1637 int i;
1638
1639 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1640 sp->spt[i] = 0ull;
1641}
1642
1478static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1643static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1479 gfn_t gfn, 1644 gfn_t gfn,
1480 gva_t gaddr, 1645 gva_t gaddr,
@@ -1537,10 +1702,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1537 1702
1538 account_shadowed(vcpu->kvm, gfn); 1703 account_shadowed(vcpu->kvm, gfn);
1539 } 1704 }
1540 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1705 init_shadow_page_table(sp);
1541 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1542 else
1543 nonpaging_prefetch_page(vcpu, sp);
1544 trace_kvm_mmu_get_page(sp, true); 1706 trace_kvm_mmu_get_page(sp, true);
1545 return sp; 1707 return sp;
1546} 1708}
@@ -1572,21 +1734,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
1572 if (iterator->level < PT_PAGE_TABLE_LEVEL) 1734 if (iterator->level < PT_PAGE_TABLE_LEVEL)
1573 return false; 1735 return false;
1574 1736
1575 if (iterator->level == PT_PAGE_TABLE_LEVEL)
1576 if (is_large_pte(*iterator->sptep))
1577 return false;
1578
1579 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 1737 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
1580 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 1738 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
1581 return true; 1739 return true;
1582} 1740}
1583 1741
1584static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 1742static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
1743 u64 spte)
1585{ 1744{
1586 iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK; 1745 if (is_last_spte(spte, iterator->level)) {
1746 iterator->level = 0;
1747 return;
1748 }
1749
1750 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
1587 --iterator->level; 1751 --iterator->level;
1588} 1752}
1589 1753
1754static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
1755{
1756 return __shadow_walk_next(iterator, *iterator->sptep);
1757}
1758
1590static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 1759static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1591{ 1760{
1592 u64 spte; 1761 u64 spte;
@@ -1594,13 +1763,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
1594 spte = __pa(sp->spt) 1763 spte = __pa(sp->spt)
1595 | PT_PRESENT_MASK | PT_ACCESSED_MASK 1764 | PT_PRESENT_MASK | PT_ACCESSED_MASK
1596 | PT_WRITABLE_MASK | PT_USER_MASK; 1765 | PT_WRITABLE_MASK | PT_USER_MASK;
1597 __set_spte(sptep, spte); 1766 mmu_spte_set(sptep, spte);
1598} 1767}
1599 1768
1600static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1769static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1601{ 1770{
1602 if (is_large_pte(*sptep)) { 1771 if (is_large_pte(*sptep)) {
1603 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1772 drop_spte(vcpu->kvm, sptep);
1604 kvm_flush_remote_tlbs(vcpu->kvm); 1773 kvm_flush_remote_tlbs(vcpu->kvm);
1605 } 1774 }
1606} 1775}
@@ -1622,38 +1791,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1622 if (child->role.access == direct_access) 1791 if (child->role.access == direct_access)
1623 return; 1792 return;
1624 1793
1625 mmu_page_remove_parent_pte(child, sptep); 1794 drop_parent_pte(child, sptep);
1626 __set_spte(sptep, shadow_trap_nonpresent_pte);
1627 kvm_flush_remote_tlbs(vcpu->kvm); 1795 kvm_flush_remote_tlbs(vcpu->kvm);
1628 } 1796 }
1629} 1797}
1630 1798
1799static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
1800 u64 *spte)
1801{
1802 u64 pte;
1803 struct kvm_mmu_page *child;
1804
1805 pte = *spte;
1806 if (is_shadow_present_pte(pte)) {
1807 if (is_last_spte(pte, sp->role.level))
1808 drop_spte(kvm, spte);
1809 else {
1810 child = page_header(pte & PT64_BASE_ADDR_MASK);
1811 drop_parent_pte(child, spte);
1812 }
1813 } else if (is_mmio_spte(pte))
1814 mmu_spte_clear_no_track(spte);
1815
1816 if (is_large_pte(pte))
1817 --kvm->stat.lpages;
1818}
1819
1631static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1820static void kvm_mmu_page_unlink_children(struct kvm *kvm,
1632 struct kvm_mmu_page *sp) 1821 struct kvm_mmu_page *sp)
1633{ 1822{
1634 unsigned i; 1823 unsigned i;
1635 u64 *pt; 1824
1636 u64 ent; 1825 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1637 1826 mmu_page_zap_pte(kvm, sp, sp->spt + i);
1638 pt = sp->spt;
1639
1640 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1641 ent = pt[i];
1642
1643 if (is_shadow_present_pte(ent)) {
1644 if (!is_last_spte(ent, sp->role.level)) {
1645 ent &= PT64_BASE_ADDR_MASK;
1646 mmu_page_remove_parent_pte(page_header(ent),
1647 &pt[i]);
1648 } else {
1649 if (is_large_pte(ent))
1650 --kvm->stat.lpages;
1651 drop_spte(kvm, &pt[i],
1652 shadow_trap_nonpresent_pte);
1653 }
1654 }
1655 pt[i] = shadow_trap_nonpresent_pte;
1656 }
1657} 1827}
1658 1828
1659static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1829static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1844,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
1674{ 1844{
1675 u64 *parent_pte; 1845 u64 *parent_pte;
1676 1846
1677 while (sp->multimapped || sp->parent_pte) { 1847 while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
1678 if (!sp->multimapped) 1848 drop_parent_pte(sp, parent_pte);
1679 parent_pte = sp->parent_pte;
1680 else {
1681 struct kvm_pte_chain *chain;
1682
1683 chain = container_of(sp->parent_ptes.first,
1684 struct kvm_pte_chain, link);
1685 parent_pte = chain->parent_ptes[0];
1686 }
1687 BUG_ON(!parent_pte);
1688 kvm_mmu_put_page(sp, parent_pte);
1689 __set_spte(parent_pte, shadow_trap_nonpresent_pte);
1690 }
1691} 1849}
1692 1850
1693static int mmu_zap_unsync_children(struct kvm *kvm, 1851static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1892,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1734 /* Count self */ 1892 /* Count self */
1735 ret++; 1893 ret++;
1736 list_move(&sp->link, invalid_list); 1894 list_move(&sp->link, invalid_list);
1895 kvm_mod_used_mmu_pages(kvm, -1);
1737 } else { 1896 } else {
1738 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1897 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1739 kvm_reload_remote_mmus(kvm); 1898 kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1903,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
1744 return ret; 1903 return ret;
1745} 1904}
1746 1905
1906static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
1907{
1908 struct kvm_mmu_page *sp;
1909
1910 list_for_each_entry(sp, invalid_list, link)
1911 kvm_mmu_isolate_page(sp);
1912}
1913
1914static void free_pages_rcu(struct rcu_head *head)
1915{
1916 struct kvm_mmu_page *next, *sp;
1917
1918 sp = container_of(head, struct kvm_mmu_page, rcu);
1919 while (sp) {
1920 if (!list_empty(&sp->link))
1921 next = list_first_entry(&sp->link,
1922 struct kvm_mmu_page, link);
1923 else
1924 next = NULL;
1925 kvm_mmu_free_page(sp);
1926 sp = next;
1927 }
1928}
1929
1747static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1930static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1748 struct list_head *invalid_list) 1931 struct list_head *invalid_list)
1749{ 1932{
@@ -1754,10 +1937,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1754 1937
1755 kvm_flush_remote_tlbs(kvm); 1938 kvm_flush_remote_tlbs(kvm);
1756 1939
1940 if (atomic_read(&kvm->arch.reader_counter)) {
1941 kvm_mmu_isolate_pages(invalid_list);
1942 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1943 list_del_init(invalid_list);
1944
1945 trace_kvm_mmu_delay_free_pages(sp);
1946 call_rcu(&sp->rcu, free_pages_rcu);
1947 return;
1948 }
1949
1757 do { 1950 do {
1758 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1951 sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
1759 WARN_ON(!sp->role.invalid || sp->root_count); 1952 WARN_ON(!sp->role.invalid || sp->root_count);
1760 kvm_mmu_free_page(kvm, sp); 1953 kvm_mmu_isolate_page(sp);
1954 kvm_mmu_free_page(sp);
1761 } while (!list_empty(invalid_list)); 1955 } while (!list_empty(invalid_list));
1762 1956
1763} 1957}
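A compressed restatement (editor's summary, not code from the patch) of the reader/zapper protocol the hunks above establish:

/*
 * lockless reader                        zap path (mmu_lock held)
 * ---------------                        ------------------------
 * walk_shadow_page_lockless_begin()      kvm_mmu_prepare_zap_page()
 *   rcu_read_lock()                        -> sp moved onto invalid_list
 *   reader_counter++                     kvm_mmu_commit_zap_page()
 * ... read sptes via                       flush remote TLBs
 *     mmu_spte_get_lockless() ...          if reader_counter != 0:
 * walk_shadow_page_lockless_end()              isolate pages, then
 *   reader_counter--                           call_rcu(&sp->rcu, free_pages_rcu)
 *   rcu_read_unlock()                        else:
 *                                              isolate + free immediately
 */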
@@ -1783,8 +1977,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1783 page = container_of(kvm->arch.active_mmu_pages.prev, 1977 page = container_of(kvm->arch.active_mmu_pages.prev,
1784 struct kvm_mmu_page, link); 1978 struct kvm_mmu_page, link);
1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); 1979 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1787 } 1980 }
1981 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 1982 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1789 } 1983 }
1790 1984
@@ -1833,20 +2027,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1833 __set_bit(slot, sp->slot_bitmap); 2027 __set_bit(slot, sp->slot_bitmap);
1834} 2028}
1835 2029
1836static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1837{
1838 int i;
1839 u64 *pt = sp->spt;
1840
1841 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1842 return;
1843
1844 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1845 if (pt[i] == shadow_notrap_nonpresent_pte)
1846 __set_spte(&pt[i], shadow_trap_nonpresent_pte);
1847 }
1848}
1849
1850/* 2030/*
1851 * The function is based on mtrr_type_lookup() in 2031 * The function is based on mtrr_type_lookup() in
1852 * arch/x86/kernel/cpu/mtrr/generic.c 2032 * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2139,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1959 sp->unsync = 1; 2139 sp->unsync = 1;
1960 2140
1961 kvm_mmu_mark_parents_unsync(sp); 2141 kvm_mmu_mark_parents_unsync(sp);
1962 mmu_convert_notrap(sp);
1963} 2142}
1964 2143
1965static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 2144static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2002,13 +2181,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2002 2181
2003static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2182static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 unsigned pte_access, int user_fault, 2183 unsigned pte_access, int user_fault,
2005 int write_fault, int dirty, int level, 2184 int write_fault, int level,
2006 gfn_t gfn, pfn_t pfn, bool speculative, 2185 gfn_t gfn, pfn_t pfn, bool speculative,
2007 bool can_unsync, bool host_writable) 2186 bool can_unsync, bool host_writable)
2008{ 2187{
2009 u64 spte, entry = *sptep; 2188 u64 spte, entry = *sptep;
2010 int ret = 0; 2189 int ret = 0;
2011 2190
2191 if (set_mmio_spte(sptep, gfn, pfn, pte_access))
2192 return 0;
2193
2012 /* 2194 /*
2013 * We don't set the accessed bit, since we sometimes want to see 2195 * We don't set the accessed bit, since we sometimes want to see
2014 * whether the guest actually used the pte (in order to detect 2196 * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2199,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2017 spte = PT_PRESENT_MASK; 2199 spte = PT_PRESENT_MASK;
2018 if (!speculative) 2200 if (!speculative)
2019 spte |= shadow_accessed_mask; 2201 spte |= shadow_accessed_mask;
2020 if (!dirty) 2202
2021 pte_access &= ~ACC_WRITE_MASK;
2022 if (pte_access & ACC_EXEC_MASK) 2203 if (pte_access & ACC_EXEC_MASK)
2023 spte |= shadow_x_mask; 2204 spte |= shadow_x_mask;
2024 else 2205 else
@@ -2045,15 +2226,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2045 if (level > PT_PAGE_TABLE_LEVEL && 2226 if (level > PT_PAGE_TABLE_LEVEL &&
2046 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2227 has_wrprotected_page(vcpu->kvm, gfn, level)) {
2047 ret = 1; 2228 ret = 1;
2048 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2229 drop_spte(vcpu->kvm, sptep);
2049 goto done; 2230 goto done;
2050 } 2231 }
2051 2232
2052 spte |= PT_WRITABLE_MASK; 2233 spte |= PT_WRITABLE_MASK;
2053 2234
2054 if (!vcpu->arch.mmu.direct_map 2235 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK)) 2236 && !(pte_access & ACC_WRITE_MASK)) {
2056 spte &= ~PT_USER_MASK; 2237 spte &= ~PT_USER_MASK;
2238 /*
 2239 * If we converted a user page to a kernel page
 2240 * so that the kernel can write to it when cr0.wp=0,
 2241 * then we should prevent the kernel from executing it
2242 * if SMEP is enabled.
2243 */
2244 if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
2245 spte |= PT64_NX_MASK;
2246 }
2057 2247
2058 /* 2248 /*
2059 * Optimization: for pte sync, if spte was writable the hash 2249 * Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2268,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2078 mark_page_dirty(vcpu->kvm, gfn); 2268 mark_page_dirty(vcpu->kvm, gfn);
2079 2269
2080set_pte: 2270set_pte:
2081 update_spte(sptep, spte); 2271 mmu_spte_update(sptep, spte);
2082 /* 2272 /*
2083 * If we overwrite a writable spte with a read-only one we 2273 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect 2274 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2283,8 @@ done:
2093 2283
2094static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2284static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2095 unsigned pt_access, unsigned pte_access, 2285 unsigned pt_access, unsigned pte_access,
2096 int user_fault, int write_fault, int dirty, 2286 int user_fault, int write_fault,
2097 int *ptwrite, int level, gfn_t gfn, 2287 int *emulate, int level, gfn_t gfn,
2098 pfn_t pfn, bool speculative, 2288 pfn_t pfn, bool speculative,
2099 bool host_writable) 2289 bool host_writable)
2100{ 2290{
@@ -2117,26 +2307,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2117 u64 pte = *sptep; 2307 u64 pte = *sptep;
2118 2308
2119 child = page_header(pte & PT64_BASE_ADDR_MASK); 2309 child = page_header(pte & PT64_BASE_ADDR_MASK);
2120 mmu_page_remove_parent_pte(child, sptep); 2310 drop_parent_pte(child, sptep);
2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2122 kvm_flush_remote_tlbs(vcpu->kvm); 2311 kvm_flush_remote_tlbs(vcpu->kvm);
2123 } else if (pfn != spte_to_pfn(*sptep)) { 2312 } else if (pfn != spte_to_pfn(*sptep)) {
2124 pgprintk("hfn old %llx new %llx\n", 2313 pgprintk("hfn old %llx new %llx\n",
2125 spte_to_pfn(*sptep), pfn); 2314 spte_to_pfn(*sptep), pfn);
2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2315 drop_spte(vcpu->kvm, sptep);
2127 kvm_flush_remote_tlbs(vcpu->kvm); 2316 kvm_flush_remote_tlbs(vcpu->kvm);
2128 } else 2317 } else
2129 was_rmapped = 1; 2318 was_rmapped = 1;
2130 } 2319 }
2131 2320
2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2321 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2133 dirty, level, gfn, pfn, speculative, true, 2322 level, gfn, pfn, speculative, true,
2134 host_writable)) { 2323 host_writable)) {
2135 if (write_fault) 2324 if (write_fault)
2136 *ptwrite = 1; 2325 *emulate = 1;
2137 kvm_mmu_flush_tlb(vcpu); 2326 kvm_mmu_flush_tlb(vcpu);
2138 } 2327 }
2139 2328
2329 if (unlikely(is_mmio_spte(*sptep) && emulate))
2330 *emulate = 1;
2331
2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2332 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2333 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2142 is_large_pte(*sptep)? "2MB" : "4kB", 2334 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2337,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2145 if (!was_rmapped && is_large_pte(*sptep)) 2337 if (!was_rmapped && is_large_pte(*sptep))
2146 ++vcpu->kvm->stat.lpages; 2338 ++vcpu->kvm->stat.lpages;
2147 2339
2148 page_header_update_slot(vcpu->kvm, sptep, gfn); 2340 if (is_shadow_present_pte(*sptep)) {
2149 if (!was_rmapped) { 2341 page_header_update_slot(vcpu->kvm, sptep, gfn);
2150 rmap_count = rmap_add(vcpu, sptep, gfn); 2342 if (!was_rmapped) {
2151 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2343 rmap_count = rmap_add(vcpu, sptep, gfn);
2152 rmap_recycle(vcpu, sptep, gfn); 2344 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
2345 rmap_recycle(vcpu, sptep, gfn);
2346 }
2153 } 2347 }
2154 kvm_release_pfn_clean(pfn); 2348 kvm_release_pfn_clean(pfn);
2155 if (speculative) { 2349 if (speculative) {
@@ -2170,8 +2364,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2170 2364
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2365 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) { 2366 if (!slot) {
2173 get_page(bad_page); 2367 get_page(fault_page);
2174 return page_to_pfn(bad_page); 2368 return page_to_pfn(fault_page);
2175 } 2369 }
2176 2370
2177 hva = gfn_to_hva_memslot(slot, gfn); 2371 hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2392,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2198 2392
2199 for (i = 0; i < ret; i++, gfn++, start++) 2393 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL, 2394 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL, 2395 access, 0, 0, NULL,
2202 sp->role.level, gfn, 2396 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true); 2397 page_to_pfn(pages[i]), true, true);
2204 2398
@@ -2217,7 +2411,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2217 spte = sp->spt + i; 2411 spte = sp->spt + i;
2218 2412
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2413 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { 2414 if (is_shadow_present_pte(*spte) || spte == sptep) {
2221 if (!start) 2415 if (!start)
2222 continue; 2416 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2417 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2448,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2254{ 2448{
2255 struct kvm_shadow_walk_iterator iterator; 2449 struct kvm_shadow_walk_iterator iterator;
2256 struct kvm_mmu_page *sp; 2450 struct kvm_mmu_page *sp;
2257 int pt_write = 0; 2451 int emulate = 0;
2258 gfn_t pseudo_gfn; 2452 gfn_t pseudo_gfn;
2259 2453
2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2454 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2456,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2262 unsigned pte_access = ACC_ALL; 2456 unsigned pte_access = ACC_ALL;
2263 2457
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 2458 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2265 0, write, 1, &pt_write, 2459 0, write, &emulate,
2266 level, gfn, pfn, prefault, map_writable); 2460 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep); 2461 direct_pte_prefetch(vcpu, iterator.sptep);
2268 ++vcpu->stat.pf_fixed; 2462 ++vcpu->stat.pf_fixed;
2269 break; 2463 break;
2270 } 2464 }
2271 2465
2272 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2466 if (!is_shadow_present_pte(*iterator.sptep)) {
2273 u64 base_addr = iterator.addr; 2467 u64 base_addr = iterator.addr;
2274 2468
2275 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2469 base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2477,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2283 return -ENOMEM; 2477 return -ENOMEM;
2284 } 2478 }
2285 2479
2286 __set_spte(iterator.sptep, 2480 mmu_spte_set(iterator.sptep,
2287 __pa(sp->spt) 2481 __pa(sp->spt)
2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2482 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2289 | shadow_user_mask | shadow_x_mask 2483 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask); 2484 | shadow_accessed_mask);
2291 } 2485 }
2292 } 2486 }
2293 return pt_write; 2487 return emulate;
2294} 2488}
2295 2489
2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2490static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2500,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
2306 send_sig_info(SIGBUS, &info, tsk); 2500 send_sig_info(SIGBUS, &info, tsk);
2307} 2501}
2308 2502
2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2503static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
2310{ 2504{
2311 kvm_release_pfn_clean(pfn); 2505 kvm_release_pfn_clean(pfn);
2312 if (is_hwpoison_pfn(pfn)) { 2506 if (is_hwpoison_pfn(pfn)) {
2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); 2507 kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
2314 return 0; 2508 return 0;
2315 } else if (is_fault_pfn(pfn)) 2509 }
2316 return -EFAULT;
2317 2510
2318 return 1; 2511 return -EFAULT;
2319} 2512}
2320 2513
2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2514static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2553,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2360 } 2553 }
2361} 2554}
2362 2555
2556static bool mmu_invalid_pfn(pfn_t pfn)
2557{
2558 return unlikely(is_invalid_pfn(pfn));
2559}
2560
2561static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
2562 pfn_t pfn, unsigned access, int *ret_val)
2563{
2564 bool ret = true;
2565
2566 /* The pfn is invalid, report the error! */
2567 if (unlikely(is_invalid_pfn(pfn))) {
2568 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
2569 goto exit;
2570 }
2571
2572 if (unlikely(is_noslot_pfn(pfn)))
2573 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
2574
2575 ret = false;
2576exit:
2577 return ret;
2578}
2579
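Editor's note on the new helper: handle_abnormal_pfn() folds the two non-mapping outcomes of a pfn lookup into one predicate. An invalid pfn is converted into a return code via kvm_handle_bad_page(), while a no-slot pfn only records the gva/gfn/access triple for the MMIO path and lets the fault handler continue. The sketch below is a minimal, self-contained userspace model of that control flow; every name in it is invented for illustration and nothing here is part of the commit.

/* Editor's sketch only: models the handle_abnormal_pfn() contract. */
#include <stdbool.h>
#include <stdio.h>

enum pfn_class { PFN_OK, PFN_INVALID, PFN_NOSLOT };	/* invented classes */

/* true means "abnormal: stop and return *ret_val to the caller" */
static bool model_handle_abnormal_pfn(enum pfn_class pfn, int *ret_val)
{
	if (pfn == PFN_INVALID) {
		*ret_val = -14;		/* stands in for kvm_handle_bad_page() */
		return true;
	}
	if (pfn == PFN_NOSLOT)
		printf("cache gva/gfn/access for the MMIO path\n");
	return false;			/* normal (or MMIO) pfn: keep mapping */
}

static int model_fault_path(enum pfn_class pfn)
{
	int r;

	if (model_handle_abnormal_pfn(pfn, &r))
		return r;
	return 0;			/* proceed to build the mapping */
}

int main(void)
{
	printf("ok      -> %d\n", model_fault_path(PFN_OK));
	printf("noslot  -> %d\n", model_fault_path(PFN_NOSLOT));
	printf("invalid -> %d\n", model_fault_path(PFN_INVALID));
	return 0;
}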
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2580static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2581 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365 2582
@@ -2394,9 +2611,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 2611 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0; 2612 return 0;
2396 2613
2397 /* mmio */ 2614 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
2398 if (is_error_pfn(pfn)) 2615 return r;
2399 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2400 2616
2401 spin_lock(&vcpu->kvm->mmu_lock); 2617 spin_lock(&vcpu->kvm->mmu_lock);
2402 if (mmu_notifier_retry(vcpu, mmu_seq)) 2618 if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2839,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2839 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2624 return; 2840 return;
2625 2841
2842 vcpu_clear_mmio_info(vcpu, ~0ul);
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 2843 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 2844 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2628 hpa_t root = vcpu->arch.mmu.root_hpa; 2845 hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2884,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); 2884 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668} 2885}
2669 2886
2887static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2888{
2889 if (direct)
2890 return vcpu_match_mmio_gpa(vcpu, addr);
2891
2892 return vcpu_match_mmio_gva(vcpu, addr);
2893}
2894
2895
2896/*
2897 * On direct hosts, the last spte can only be in one of two states
2898 * for an mmio page fault:
2899 * - It is the mmio spte
2900 * - It is zapped or it is being zapped.
2901 *
2902 * This function completely checks the spte when the last spte
2903 * is not the mmio spte.
2904 */
2905static bool check_direct_spte_mmio_pf(u64 spte)
2906{
2907 return __check_direct_spte_mmio_pf(spte);
2908}
2909
2910static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
2911{
2912 struct kvm_shadow_walk_iterator iterator;
2913 u64 spte = 0ull;
2914
2915 walk_shadow_page_lockless_begin(vcpu);
2916 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
2917 if (!is_shadow_present_pte(spte))
2918 break;
2919 walk_shadow_page_lockless_end(vcpu);
2920
2921 return spte;
2922}
2923
2924/*
2925 * If it is a real mmio page fault, return 1 and emulate the instruction
2926 * directly, return 0 to let the CPU fault again on the address, and
2927 * return -1 if a bug is detected.
2928 */
2929int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
2930{
2931 u64 spte;
2932
2933 if (quickly_check_mmio_pf(vcpu, addr, direct))
2934 return 1;
2935
2936 spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
2937
2938 if (is_mmio_spte(spte)) {
2939 gfn_t gfn = get_mmio_spte_gfn(spte);
2940 unsigned access = get_mmio_spte_access(spte);
2941
2942 if (direct)
2943 addr = 0;
2944
2945 trace_handle_mmio_page_fault(addr, gfn, access);
2946 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
2947 return 1;
2948 }
2949
2950 /*
2951 * It's ok if the gva is remapped by other cpus on a shadow guest,
2952 * but it's a BUG if the gfn is not an mmio page.
2953 */
2954 if (direct && !check_direct_spte_mmio_pf(spte))
2955 return -1;
2956
2957 /*
2958 * If the page table is zapped by other cpus, let the CPU fault again on
2959 * the address.
2960 */
2961 return 0;
2962}
2963EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
2964
2965static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
2966 u32 error_code, bool direct)
2967{
2968 int ret;
2969
2970 ret = handle_mmio_page_fault_common(vcpu, addr, direct);
2971 WARN_ON(ret < 0);
2972 return ret;
2973}
2974
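Editor's note on the return contract: as the comment above handle_mmio_page_fault_common() states, callers must distinguish three outcomes. 1 means a genuine MMIO access that should be emulated, 0 means the walk raced with a zap and the guest should simply re-fault, and a negative value signals an inconsistency; the reserved-bit fault paths below go through the WARN_ON() wrapper, and the EXPORT_SYMBOL_GPL presumably serves a caller outside this file. The sketch below is an editor's self-contained illustration of that same contract; all helper names are invented and this is not the real consumer.

/* Editor's sketch only: consuming the three-way MMIO-fault result. */
#include <stdio.h>

/* stand-in for handle_mmio_page_fault_common(vcpu, addr, direct) */
static int model_mmio_pf_common(unsigned long long addr, int direct)
{
	(void)addr;
	(void)direct;
	return 1;		/* pretend a valid MMIO spte was found */
}

static int model_consume_mmio_fault(unsigned long long gpa)
{
	int ret = model_mmio_pf_common(gpa, 1);

	if (ret == 1) {		/* real MMIO access: hand it to the emulator */
		printf("emulate MMIO access at %#llx\n", gpa);
		return 1;
	}
	if (ret == 0)		/* spte zapped under us: let the guest re-fault */
		return 1;
	return -1;		/* ret < 0: shadow pages are inconsistent, bug */
}

int main(void)
{
	return model_consume_mmio_fault(0xfee00000ull) == 1 ? 0 : 1;
}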
2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2975static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2671 u32 error_code, bool prefault) 2976 u32 error_code, bool prefault)
2672{ 2977{
@@ -2674,6 +2979,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2674 int r; 2979 int r;
2675 2980
2676 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 2981 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
2982
2983 if (unlikely(error_code & PFERR_RSVD_MASK))
2984 return handle_mmio_page_fault(vcpu, gva, error_code, true);
2985
2677 r = mmu_topup_memory_caches(vcpu); 2986 r = mmu_topup_memory_caches(vcpu);
2678 if (r) 2987 if (r)
2679 return r; 2988 return r;
@@ -2750,6 +3059,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2750 ASSERT(vcpu); 3059 ASSERT(vcpu);
2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3060 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
2752 3061
3062 if (unlikely(error_code & PFERR_RSVD_MASK))
3063 return handle_mmio_page_fault(vcpu, gpa, error_code, true);
3064
2753 r = mmu_topup_memory_caches(vcpu); 3065 r = mmu_topup_memory_caches(vcpu);
2754 if (r) 3066 if (r)
2755 return r; 3067 return r;
@@ -2767,9 +3079,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3079 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0; 3080 return 0;
2769 3081
2770 /* mmio */ 3082 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
2771 if (is_error_pfn(pfn)) 3083 return r;
2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 3084
2773 spin_lock(&vcpu->kvm->mmu_lock); 3085 spin_lock(&vcpu->kvm->mmu_lock);
2774 if (mmu_notifier_retry(vcpu, mmu_seq)) 3086 if (mmu_notifier_retry(vcpu, mmu_seq))
2775 goto out_unlock; 3087 goto out_unlock;
@@ -2800,7 +3112,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2800 context->page_fault = nonpaging_page_fault; 3112 context->page_fault = nonpaging_page_fault;
2801 context->gva_to_gpa = nonpaging_gva_to_gpa; 3113 context->gva_to_gpa = nonpaging_gva_to_gpa;
2802 context->free = nonpaging_free; 3114 context->free = nonpaging_free;
2803 context->prefetch_page = nonpaging_prefetch_page;
2804 context->sync_page = nonpaging_sync_page; 3115 context->sync_page = nonpaging_sync_page;
2805 context->invlpg = nonpaging_invlpg; 3116 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte; 3117 context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3159,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3159 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2849} 3160}
2850 3161
3162static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
3163 int *nr_present)
3164{
3165 if (unlikely(is_mmio_spte(*sptep))) {
3166 if (gfn != get_mmio_spte_gfn(*sptep)) {
3167 mmu_spte_clear_no_track(sptep);
3168 return true;
3169 }
3170
3171 (*nr_present)++;
3172 mark_mmio_spte(sptep, gfn, access);
3173 return true;
3174 }
3175
3176 return false;
3177}
3178
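Editor's note on sync_mmio_spte(): it lets a sync_page() implementation short-circuit MMIO sptes. If the cached gfn still matches, the spte is merely refreshed and counted as present; otherwise it is cleared with mmu_spte_clear_no_track(), presumably because an MMIO spte carries no rmap state to maintain. The self-contained model below mirrors only that decision structure; its spte encoding and every name are invented for illustration and are not KVM's real MMIO-spte format.

/* Editor's sketch only: a userspace model of the sync_mmio_spte() logic. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_MMIO_FLAG (1ull << 63)	/* invented marker bit */

static uint64_t model_mk_mmio(uint64_t gfn, unsigned int access)
{
	return MODEL_MMIO_FLAG | (gfn << 12) | (access & 0x7);
}

static uint64_t model_mmio_gfn(uint64_t spte)
{
	return (spte & ~MODEL_MMIO_FLAG) >> 12;
}

/* Mirrors the decision structure: returns true if the spte was handled. */
static bool model_sync_mmio_spte(uint64_t *sptep, uint64_t gfn,
				 unsigned int access, int *nr_present)
{
	if (!(*sptep & MODEL_MMIO_FLAG))
		return false;		/* not MMIO: sync it the normal way */

	if (model_mmio_gfn(*sptep) != gfn) {
		*sptep = 0;		/* stale gfn: drop it, nothing to untrack */
		return true;
	}

	(*nr_present)++;		/* still MMIO-backed: just refresh it */
	*sptep = model_mk_mmio(gfn, access);
	return true;
}

int main(void)
{
	uint64_t spte = model_mk_mmio(0x100, 0x3);
	int nr_present = 0;
	bool handled;

	handled = model_sync_mmio_spte(&spte, 0x100, 0x3, &nr_present);
	printf("same gfn: handled=%d nr_present=%d\n", handled, nr_present);

	handled = model_sync_mmio_spte(&spte, 0x200, 0x3, &nr_present);
	printf("new gfn:  handled=%d spte=%#llx\n", handled,
	       (unsigned long long)spte);
	return 0;
}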
2851#define PTTYPE 64 3179#define PTTYPE 64
2852#include "paging_tmpl.h" 3180#include "paging_tmpl.h"
2853#undef PTTYPE 3181#undef PTTYPE
@@ -2930,7 +3258,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2930 context->new_cr3 = paging_new_cr3; 3258 context->new_cr3 = paging_new_cr3;
2931 context->page_fault = paging64_page_fault; 3259 context->page_fault = paging64_page_fault;
2932 context->gva_to_gpa = paging64_gva_to_gpa; 3260 context->gva_to_gpa = paging64_gva_to_gpa;
2933 context->prefetch_page = paging64_prefetch_page;
2934 context->sync_page = paging64_sync_page; 3261 context->sync_page = paging64_sync_page;
2935 context->invlpg = paging64_invlpg; 3262 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte; 3263 context->update_pte = paging64_update_pte;
@@ -2959,7 +3286,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
2959 context->page_fault = paging32_page_fault; 3286 context->page_fault = paging32_page_fault;
2960 context->gva_to_gpa = paging32_gva_to_gpa; 3287 context->gva_to_gpa = paging32_gva_to_gpa;
2961 context->free = paging_free; 3288 context->free = paging_free;
2962 context->prefetch_page = paging32_prefetch_page;
2963 context->sync_page = paging32_sync_page; 3289 context->sync_page = paging32_sync_page;
2964 context->invlpg = paging32_invlpg; 3290 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte; 3291 context->update_pte = paging32_update_pte;
@@ -2984,7 +3310,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2984 context->new_cr3 = nonpaging_new_cr3; 3310 context->new_cr3 = nonpaging_new_cr3;
2985 context->page_fault = tdp_page_fault; 3311 context->page_fault = tdp_page_fault;
2986 context->free = nonpaging_free; 3312 context->free = nonpaging_free;
2987 context->prefetch_page = nonpaging_prefetch_page;
2988 context->sync_page = nonpaging_sync_page; 3313 context->sync_page = nonpaging_sync_page;
2989 context->invlpg = nonpaging_invlpg; 3314 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte; 3315 context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3348,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3348int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3024{ 3349{
3025 int r; 3350 int r;
3351 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
3026 ASSERT(vcpu); 3352 ASSERT(vcpu);
3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3353 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
3028 3354
@@ -3037,6 +3363,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
3037 3363
3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3364 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3365 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
3366 vcpu->arch.mmu.base_role.smep_andnot_wp
3367 = smep && !is_write_protection(vcpu);
3040 3368
3041 return r; 3369 return r;
3042} 3370}
@@ -3141,27 +3469,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
3141} 3469}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload); 3470EXPORT_SYMBOL_GPL(kvm_mmu_unload);
3143 3471
3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3145 struct kvm_mmu_page *sp,
3146 u64 *spte)
3147{
3148 u64 pte;
3149 struct kvm_mmu_page *child;
3150
3151 pte = *spte;
3152 if (is_shadow_present_pte(pte)) {
3153 if (is_last_spte(pte, sp->role.level))
3154 drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
3155 else {
3156 child = page_header(pte & PT64_BASE_ADDR_MASK);
3157 mmu_page_remove_parent_pte(child, spte);
3158 }
3159 }
3160 __set_spte(spte, shadow_trap_nonpresent_pte);
3161 if (is_large_pte(pte))
3162 --vcpu->kvm->stat.lpages;
3163}
3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3472static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, u64 *spte, 3473 struct kvm_mmu_page *sp, u64 *spte,
3167 const void *new) 3474 const void *new)
@@ -3233,6 +3540,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3233 int level, npte, invlpg_counter, r, flooded = 0; 3540 int level, npte, invlpg_counter, r, flooded = 0;
3234 bool remote_flush, local_flush, zap_page; 3541 bool remote_flush, local_flush, zap_page;
3235 3542
3543 /*
3544 * If we don't have indirect shadow pages, it means no page is
3545 * write-protected, so we can exit simply.
3546 */
3547 if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
3548 return;
3549
3236 zap_page = remote_flush = local_flush = false; 3550 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa); 3551 offset = offset_in_page(gpa);
3238 3552
@@ -3336,7 +3650,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3336 spte = &sp->spt[page_offset / sizeof(*spte)]; 3650 spte = &sp->spt[page_offset / sizeof(*spte)];
3337 while (npte--) { 3651 while (npte--) {
3338 entry = *spte; 3652 entry = *spte;
3339 mmu_pte_write_zap_pte(vcpu, sp, spte); 3653 mmu_page_zap_pte(vcpu->kvm, sp, spte);
3340 if (gentry && 3654 if (gentry &&
3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3655 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3342 & mask.word)) 3656 & mask.word))
@@ -3380,9 +3694,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3694 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
3381 struct kvm_mmu_page, link); 3695 struct kvm_mmu_page, link);
3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3696 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3384 ++vcpu->kvm->stat.mmu_recycled; 3697 ++vcpu->kvm->stat.mmu_recycled;
3385 } 3698 }
3699 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3386} 3700}
3387 3701
3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 3702int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3820,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3506 continue; 3820 continue;
3507 3821
3508 if (is_large_pte(pt[i])) { 3822 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i], 3823 drop_spte(kvm, &pt[i]);
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages; 3824 --kvm->stat.lpages;
3512 continue; 3825 continue;
3513 } 3826 }
3514 3827
3515 /* avoid RMW */ 3828 /* avoid RMW */
3516 if (is_writable_pte(pt[i])) 3829 if (is_writable_pte(pt[i]))
3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); 3830 mmu_spte_update(&pt[i],
3831 pt[i] & ~PT_WRITABLE_MASK);
3518 } 3832 }
3519 } 3833 }
3520 kvm_flush_remote_tlbs(kvm); 3834 kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3904,18 @@ static struct shrinker mmu_shrinker = {
3590 3904
3591static void mmu_destroy_caches(void) 3905static void mmu_destroy_caches(void)
3592{ 3906{
3593 if (pte_chain_cache) 3907 if (pte_list_desc_cache)
3594 kmem_cache_destroy(pte_chain_cache); 3908 kmem_cache_destroy(pte_list_desc_cache);
3595 if (rmap_desc_cache)
3596 kmem_cache_destroy(rmap_desc_cache);
3597 if (mmu_page_header_cache) 3909 if (mmu_page_header_cache)
3598 kmem_cache_destroy(mmu_page_header_cache); 3910 kmem_cache_destroy(mmu_page_header_cache);
3599} 3911}
3600 3912
3601int kvm_mmu_module_init(void) 3913int kvm_mmu_module_init(void)
3602{ 3914{
3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3915 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
3604 sizeof(struct kvm_pte_chain), 3916 sizeof(struct pte_list_desc),
3605 0, 0, NULL);
3606 if (!pte_chain_cache)
3607 goto nomem;
3608 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
3609 sizeof(struct kvm_rmap_desc),
3610 0, 0, NULL); 3917 0, 0, NULL);
3611 if (!rmap_desc_cache) 3918 if (!pte_list_desc_cache)
3612 goto nomem; 3919 goto nomem;
3613 3920
3614 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 3921 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4082,17 @@ out:
3775int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) 4082int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3776{ 4083{
3777 struct kvm_shadow_walk_iterator iterator; 4084 struct kvm_shadow_walk_iterator iterator;
4085 u64 spte;
3778 int nr_sptes = 0; 4086 int nr_sptes = 0;
3779 4087
3780 spin_lock(&vcpu->kvm->mmu_lock); 4088 walk_shadow_page_lockless_begin(vcpu);
3781 for_each_shadow_entry(vcpu, addr, iterator) { 4089 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
3782 sptes[iterator.level-1] = *iterator.sptep; 4090 sptes[iterator.level-1] = spte;
3783 nr_sptes++; 4091 nr_sptes++;
3784 if (!is_shadow_present_pte(*iterator.sptep)) 4092 if (!is_shadow_present_pte(spte))
3785 break; 4093 break;
3786 } 4094 }
3787 spin_unlock(&vcpu->kvm->mmu_lock); 4095 walk_shadow_page_lockless_end(vcpu);
3788 4096
3789 return nr_sptes; 4097 return nr_sptes;
3790} 4098}