Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c | 1371
1 file changed, 780 insertions, 591 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..aee38623b768 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -49,15 +51,25 @@
49 */ 51 */
50bool tdp_enabled = false; 52bool tdp_enabled = false;
51 53
52#undef MMU_DEBUG 54enum {
55 AUDIT_PRE_PAGE_FAULT,
56 AUDIT_POST_PAGE_FAULT,
57 AUDIT_PRE_PTE_WRITE,
58 AUDIT_POST_PTE_WRITE,
59 AUDIT_PRE_SYNC,
60 AUDIT_POST_SYNC
61};
53 62
54#undef AUDIT 63char *audit_point_name[] = {
64 "pre page fault",
65 "post page fault",
66 "pre pte write",
67 "post pte write",
68 "pre sync",
69 "post sync"
70};
55 71
56#ifdef AUDIT 72#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 73
62#ifdef MMU_DEBUG 74#ifdef MMU_DEBUG
63 75
@@ -71,7 +83,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 83
72#endif 84#endif
73 85
74#if defined(MMU_DEBUG) || defined(AUDIT) 86#ifdef MMU_DEBUG
75static int dbg = 0; 87static int dbg = 0;
76module_param(dbg, bool, 0644); 88module_param(dbg, bool, 0644);
77#endif 89#endif
@@ -89,6 +101,8 @@ module_param(oos_shadow, bool, 0644);
89 } 101 }
90#endif 102#endif
91 103
104#define PTE_PREFETCH_NUM 8
105
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 106#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 107#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 108
@@ -97,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
97#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
98 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
99 113
100#define PT64_LEVEL_MASK(level) \
101 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
102
103#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
104 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
105 116
@@ -109,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
109#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
110 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
111 122
112#define PT32_LEVEL_MASK(level) \
113 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
115 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
116 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -178,10 +187,10 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 187static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 188static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 189static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages;
181 191
182static u64 __read_mostly shadow_trap_nonpresent_pte; 192static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 193static u64 __read_mostly shadow_notrap_nonpresent_pte;
184static u64 __read_mostly shadow_base_present_pte;
185static u64 __read_mostly shadow_nx_mask; 194static u64 __read_mostly shadow_nx_mask;
186static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
187static u64 __read_mostly shadow_user_mask; 196static u64 __read_mostly shadow_user_mask;
@@ -200,12 +209,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
200} 209}
201EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
202 211
203void kvm_mmu_set_base_ptes(u64 base_pte)
204{
205 shadow_base_present_pte = base_pte;
206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
208
209void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
210 u64 dirty_mask, u64 nx_mask, u64 x_mask) 213 u64 dirty_mask, u64 nx_mask, u64 x_mask)
211{ 214{
@@ -299,18 +302,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 302#endif
300} 303}
301 304
305static bool spte_has_volatile_bits(u64 spte)
306{
307 if (!shadow_accessed_mask)
308 return false;
309
310 if (!is_shadow_present_pte(spte))
311 return false;
312
313 if ((spte & shadow_accessed_mask) &&
314 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
315 return false;
316
317 return true;
318}
319
320static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
321{
322 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323}
324
302static void update_spte(u64 *sptep, u64 new_spte) 325static void update_spte(u64 *sptep, u64 new_spte)
303{ 326{
304 u64 old_spte; 327 u64 mask, old_spte = *sptep;
328
329 WARN_ON(!is_rmap_spte(new_spte));
330
331 new_spte |= old_spte & shadow_dirty_mask;
332
333 mask = shadow_accessed_mask;
334 if (is_writable_pte(old_spte))
335 mask |= shadow_dirty_mask;
305 336
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte); 338 __set_spte(sptep, new_spte);
309 else { 339 else
310 old_spte = __xchg_spte(sptep, new_spte); 340 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 341
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 342 if (!shadow_accessed_mask)
313 } 343 return;
344
345 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
346 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
347 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 349}
315 350
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
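
The new spte_has_volatile_bits()/update_spte() pair addresses a race: once a shadow PTE is present, the CPU can set its accessed and dirty bits at any time, so overwriting the entry with a plain store may silently lose an update that has to be forwarded to the backing page (via kvm_set_pfn_accessed()/kvm_set_pfn_dirty() in the hunk above). The following is a minimal userspace model of that pattern; the bit values and the spte_write()/spte_is_volatile() helpers are illustrative stand-ins, not KVM code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define A_BIT (1ull << 5)       /* stand-in for shadow_accessed_mask */
#define D_BIT (1ull << 6)       /* stand-in for shadow_dirty_mask */

/* "Volatile" here means hardware could still set A or D behind our back. */
static int spte_is_volatile(uint64_t spte)
{
        return (spte & (A_BIT | D_BIT)) != (A_BIT | D_BIT);
}

static void spte_write(_Atomic uint64_t *sptep, uint64_t new_spte)
{
        uint64_t old_spte = atomic_load(sptep);

        /* A plain store is fine when nothing can be lost, or when the new
         * value already carries the bits we care about. */
        if (!spte_is_volatile(old_spte) ||
            (new_spte & (A_BIT | D_BIT)) == (A_BIT | D_BIT)) {
                atomic_store(sptep, new_spte);
                return;
        }

        /* Otherwise swap atomically and act on the value we displaced;
         * the real code hands this to kvm_set_pfn_accessed()/_dirty(). */
        old_spte = atomic_exchange(sptep, new_spte);
        if ((old_spte & A_BIT) && !(new_spte & A_BIT))
                puts("old entry was accessed");
        if ((old_spte & D_BIT) && !(new_spte & D_BIT))
                puts("old entry was dirty");
}

int main(void)
{
        _Atomic uint64_t spte = 0x1000;         /* present, A/D clear */

        atomic_fetch_or(&spte, A_BIT);          /* "hardware" marks it accessed */
        spte_write(&spte, 0x2000);              /* takes the exchange path */
        return 0;
}
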
@@ -339,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
340 int min) 375 int min)
341{ 376{
342 struct page *page; 377 void *page;
343 378
344 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
345 return 0; 380 return 0;
346 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
347 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
348 if (!page) 383 if (!page)
349 return -ENOMEM; 384 return -ENOMEM;
350 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
351 } 386 }
352 return 0; 387 return 0;
353} 388}
@@ -367,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 402 if (r)
368 goto out; 403 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 406 if (r)
372 goto out; 407 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -437,46 +472,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
437} 472}
438 473
439/* 474/*
440 * Return the pointer to the largepage write count for a given 475 * Return the pointer to the large page information for a given gfn,
441 * gfn, handling slots that are not large page aligned. 476 * handling slots that are not large page aligned.
442 */ 477 */
443static int *slot_largepage_idx(gfn_t gfn, 478static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
444 struct kvm_memory_slot *slot, 479 struct kvm_memory_slot *slot,
445 int level) 480 int level)
446{ 481{
447 unsigned long idx; 482 unsigned long idx;
448 483
449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 484 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 485 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451 return &slot->lpage_info[level - 2][idx].write_count; 486 return &slot->lpage_info[level - 2][idx];
452} 487}
453 488
454static void account_shadowed(struct kvm *kvm, gfn_t gfn) 489static void account_shadowed(struct kvm *kvm, gfn_t gfn)
455{ 490{
456 struct kvm_memory_slot *slot; 491 struct kvm_memory_slot *slot;
457 int *write_count; 492 struct kvm_lpage_info *linfo;
458 int i; 493 int i;
459 494
460 slot = gfn_to_memslot(kvm, gfn); 495 slot = gfn_to_memslot(kvm, gfn);
461 for (i = PT_DIRECTORY_LEVEL; 496 for (i = PT_DIRECTORY_LEVEL;
462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 497 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
463 write_count = slot_largepage_idx(gfn, slot, i); 498 linfo = lpage_info_slot(gfn, slot, i);
464 *write_count += 1; 499 linfo->write_count += 1;
465 } 500 }
466} 501}
467 502
468static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
469{ 504{
470 struct kvm_memory_slot *slot; 505 struct kvm_memory_slot *slot;
471 int *write_count; 506 struct kvm_lpage_info *linfo;
472 int i; 507 int i;
473 508
474 slot = gfn_to_memslot(kvm, gfn); 509 slot = gfn_to_memslot(kvm, gfn);
475 for (i = PT_DIRECTORY_LEVEL; 510 for (i = PT_DIRECTORY_LEVEL;
476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 511 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
477 write_count = slot_largepage_idx(gfn, slot, i); 512 linfo = lpage_info_slot(gfn, slot, i);
478 *write_count -= 1; 513 linfo->write_count -= 1;
479 WARN_ON(*write_count < 0); 514 WARN_ON(linfo->write_count < 0);
480 } 515 }
481} 516}
482 517
@@ -485,12 +520,12 @@ static int has_wrprotected_page(struct kvm *kvm,
485 int level) 520 int level)
486{ 521{
487 struct kvm_memory_slot *slot; 522 struct kvm_memory_slot *slot;
488 int *largepage_idx; 523 struct kvm_lpage_info *linfo;
489 524
490 slot = gfn_to_memslot(kvm, gfn); 525 slot = gfn_to_memslot(kvm, gfn);
491 if (slot) { 526 if (slot) {
492 largepage_idx = slot_largepage_idx(gfn, slot, level); 527 linfo = lpage_info_slot(gfn, slot, level);
493 return *largepage_idx; 528 return linfo->write_count;
494 } 529 }
495 530
496 return 1; 531 return 1;
@@ -514,14 +549,28 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
514 return ret; 549 return ret;
515} 550}
516 551
517static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
518{ 555{
519 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
520 int host_level, level, max_level;
521 557
522 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
523 if (slot && slot->dirty_bitmap) 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
524 return PT_PAGE_TABLE_LEVEL; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
569}
570
571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
572{
573 int host_level, level, max_level;
525 574
526 host_level = host_mapping_level(vcpu->kvm, large_gfn); 575 host_level = host_mapping_level(vcpu->kvm, large_gfn);
527 576
@@ -545,16 +594,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546{ 595{
547 struct kvm_memory_slot *slot; 596 struct kvm_memory_slot *slot;
548 unsigned long idx; 597 struct kvm_lpage_info *linfo;
549 598
550 slot = gfn_to_memslot(kvm, gfn); 599 slot = gfn_to_memslot(kvm, gfn);
551 if (likely(level == PT_PAGE_TABLE_LEVEL)) 600 if (likely(level == PT_PAGE_TABLE_LEVEL))
552 return &slot->rmap[gfn - slot->base_gfn]; 601 return &slot->rmap[gfn - slot->base_gfn];
553 602
554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 603 linfo = lpage_info_slot(gfn, slot, level);
555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
556 604
557 return &slot->lpage_info[level - 2][idx].rmap_pde; 605 return &linfo->rmap_pde;
558} 606}
559 607
560/* 608/*
@@ -591,6 +639,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 639 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 640 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 641 *rmapp = (unsigned long)desc | 1;
642 ++count;
594 } else { 643 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +652,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 652 desc = desc->more;
604 } 653 }
605 for (i = 0; desc->sptes[i]; ++i) 654 for (i = 0; desc->sptes[i]; ++i)
606 ; 655 ++count;
607 desc->sptes[i] = spte; 656 desc->sptes[i] = spte;
608 } 657 }
609 return count; 658 return count;
@@ -645,18 +694,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 696 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 698 BUG();
650 } else if (!(*rmapp & 1)) { 699 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 700 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 701 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 703 BUG();
656 } 704 }
657 *rmapp = 0; 705 *rmapp = 0;
658 } else { 706 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 707 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 709 prev_desc = NULL;
662 while (desc) { 710 while (desc) {
@@ -670,35 +718,36 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 718 prev_desc = desc;
671 desc = desc->more; 719 desc = desc->more;
672 } 720 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 721 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 722 BUG();
675 } 723 }
676} 724}
677 725
678static void set_spte_track_bits(u64 *sptep, u64 new_spte) 726static int set_spte_track_bits(u64 *sptep, u64 new_spte)
679{ 727{
680 pfn_t pfn; 728 pfn_t pfn;
681 u64 old_spte = *sptep; 729 u64 old_spte = *sptep;
682 730
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 731 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 732 __set_spte(sptep, new_spte);
686 } else 733 else
687 old_spte = __xchg_spte(sptep, new_spte); 734 old_spte = __xchg_spte(sptep, new_spte);
688 735
689 if (!is_rmap_spte(old_spte)) 736 if (!is_rmap_spte(old_spte))
690 return; 737 return 0;
738
691 pfn = spte_to_pfn(old_spte); 739 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 741 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 743 kvm_set_pfn_dirty(pfn);
744 return 1;
696} 745}
697 746
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{ 748{
700 set_spte_track_bits(sptep, new_spte); 749 if (set_spte_track_bits(sptep, new_spte))
701 rmap_remove(kvm, sptep); 750 rmap_remove(kvm, sptep);
702} 751}
703 752
704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
@@ -746,13 +795,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 795 }
747 spte = rmap_next(kvm, rmapp, spte); 796 spte = rmap_next(kvm, rmapp, spte);
748 } 797 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 798
757 /* check for huge page mappings */ 799 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 800 for (i = PT_DIRECTORY_LEVEL;
@@ -848,19 +890,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
848 end = start + (memslot->npages << PAGE_SHIFT); 890 end = start + (memslot->npages << PAGE_SHIFT);
849 if (hva >= start && hva < end) { 891 if (hva >= start && hva < end) {
850 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 892 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
893 gfn_t gfn = memslot->base_gfn + gfn_offset;
851 894
852 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 895 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
853 896
854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 897 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
855 unsigned long idx; 898 struct kvm_lpage_info *linfo;
856 int sh; 899
857 900 linfo = lpage_info_slot(gfn, memslot,
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 901 PT_DIRECTORY_LEVEL + j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 902 ret |= handler(kvm, &linfo->rmap_pde, data);
860 (memslot->base_gfn >> sh);
861 ret |= handler(kvm,
862 &memslot->lpage_info[j][idx].rmap_pde,
863 data);
864 } 903 }
865 trace_kvm_age_page(hva, memslot, ret); 904 trace_kvm_age_page(hva, memslot, ret);
866 retval |= ret; 905 retval |= ret;
@@ -911,6 +950,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
911 return young; 950 return young;
912} 951}
913 952
953static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
954 unsigned long data)
955{
956 u64 *spte;
957 int young = 0;
958
959 /*
960 * If there's no access bit in the secondary pte set by the
961 * hardware it's up to gup-fast/gup to set the access bit in
962 * the primary pte or in the page structure.
963 */
964 if (!shadow_accessed_mask)
965 goto out;
966
967 spte = rmap_next(kvm, rmapp, NULL);
968 while (spte) {
969 u64 _spte = *spte;
970 BUG_ON(!(_spte & PT_PRESENT_MASK));
971 young = _spte & PT_ACCESSED_MASK;
972 if (young) {
973 young = 1;
974 break;
975 }
976 spte = rmap_next(kvm, rmapp, spte);
977 }
978out:
979 return young;
980}
981
914#define RMAP_RECYCLE_THRESHOLD 1000 982#define RMAP_RECYCLE_THRESHOLD 1000
915 983
916static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 984static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -931,6 +999,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
931 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 999 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
932} 1000}
933 1001
1002int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1003{
1004 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1005}
1006
934#ifdef MMU_DEBUG 1007#ifdef MMU_DEBUG
935static int is_empty_shadow_page(u64 *spt) 1008static int is_empty_shadow_page(u64 *spt)
936{ 1009{
@@ -947,16 +1020,28 @@ static int is_empty_shadow_page(u64 *spt)
947} 1020}
948#endif 1021#endif
949 1022
1023/*
1024 * This value is the sum of all of the kvm instances's
1025 * kvm->arch.n_used_mmu_pages values. We need a global,
1026 * aggregate version in order to make the slab shrinker
1027 * faster
1028 */
1029static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1030{
1031 kvm->arch.n_used_mmu_pages += nr;
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033}
1034
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1036{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
954 list_del(&sp->link); 1039 list_del(&sp->link);
955 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
956 if (!sp->role.direct) 1041 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
958 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1044 kvm_mod_used_mmu_pages(kvm, -1);
960} 1045}
961 1046
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1047static unsigned kvm_page_table_hashfn(gfn_t gfn)
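
The accounting change in this hunk replaces the per-VM free-page counter with n_used_mmu_pages and mirrors every allocation and free into the global kvm_total_used_mmu_pages percpu_counter, so the MMU shrinker can read one approximate total instead of walking every VM under its lock. Below is a rough userspace analogue of that "exact local count, cheap global estimate" split; the struct and helper names are illustrative only.

#include <stdatomic.h>
#include <stdio.h>

/* Approximate global total - plays the role of kvm_total_used_mmu_pages. */
static _Atomic long total_used_mmu_pages;

struct vm {
        long n_used_mmu_pages;  /* exact, guarded by the VM's own lock */
};

static void mod_used_mmu_pages(struct vm *vm, int nr)
{
        vm->n_used_mmu_pages += nr;
        /* Relaxed add: the shrinker only needs a ballpark figure. */
        atomic_fetch_add_explicit(&total_used_mmu_pages, nr,
                                  memory_order_relaxed);
}

int main(void)
{
        struct vm a = { 0 }, b = { 0 };

        mod_used_mmu_pages(&a, +1);     /* shadow page allocated */
        mod_used_mmu_pages(&b, +3);
        mod_used_mmu_pages(&a, -1);     /* shadow page freed */

        printf("shrinker sees roughly %ld shadow pages\n",
               atomic_load(&total_used_mmu_pages));
        return 0;
}
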
@@ -979,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1065 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1066 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1068 return sp;
984} 1069}
985 1070
@@ -1110,7 +1195,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1110} 1195}
1111 1196
1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1113 struct kvm_mmu_page *sp, bool clear_unsync) 1198 struct kvm_mmu_page *sp)
1114{ 1199{
1115 return 1; 1200 return 1;
1116} 1201}
@@ -1119,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1119{ 1204{
1120} 1205}
1121 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte)
1210{
1211 WARN_ON(1);
1212}
1213
1122#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1123 1215
1124struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -1240,7 +1332,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1240 if (clear_unsync) 1332 if (clear_unsync)
1241 kvm_unlink_unsync_page(vcpu->kvm, sp); 1333 kvm_unlink_unsync_page(vcpu->kvm, sp);
1242 1334
1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1335 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1336 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1245 return 1; 1337 return 1;
1246 } 1338 }
@@ -1281,12 +1373,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1281 continue; 1373 continue;
1282 1374
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1375 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1376 kvm_unlink_unsync_page(vcpu->kvm, s);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1377 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1378 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1379 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue; 1380 continue;
1288 } 1381 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true; 1382 flush = true;
1291 } 1383 }
1292 1384
@@ -1403,7 +1495,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1495 if (role.direct)
1404 role.cr4_pae = 0; 1496 role.cr4_pae = 0;
1405 role.access = access; 1497 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1498 if (!vcpu->arch.mmu.direct_map
1499 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1500 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1501 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1502 role.quadrant = quadrant;
@@ -1458,6 +1551,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1551 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1552 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1553 iterator->level = vcpu->arch.mmu.shadow_root_level;
1554
1555 if (iterator->level == PT64_ROOT_LEVEL &&
1556 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1557 !vcpu->arch.mmu.direct_map)
1558 --iterator->level;
1559
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1560 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1561 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1562 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1764,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1764
1666/* 1765/*
1667 * Changing the number of mmu pages allocated to the vm 1766 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1767 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1669 */ 1768 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1769void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1770{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1771 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1772 /*
1679 * If we set the number of mmu pages to be smaller be than the 1773 * If we set the number of mmu pages to be smaller be than the
1680 * number of actived pages , we must to free some mmu pages before we 1774 * number of actived pages , we must to free some mmu pages before we
1681 * change the value 1775 * change the value
1682 */ 1776 */
1683 1777
1684 if (used_pages > kvm_nr_mmu_pages) { 1778 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1779 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1780 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1781 struct kvm_mmu_page *page;
1688 1782
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1783 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1784 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1787 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1789 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1790
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1791 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1792}
1704 1793
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1794static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
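
With n_used_mmu_pages tracked directly, kvm_mmu_change_mmu_pages() no longer reconstructs the in-use figure from "allocated minus free": it zaps shadow pages from the tail of the active list until the count drops to the new goal, then records the goal in n_max_mmu_pages. A compact sketch of that trimming loop, using a plain array in place of the list (structures and names are illustrative, not KVM's):

#include <stdio.h>

#define MAX_SP 8

struct mmu {
        int n_used;
        int active[MAX_SP];     /* active[n_used - 1] is the oldest page */
};

static void zap_oldest(struct mmu *mmu)
{
        printf("zapping shadow page %d\n", mmu->active[--mmu->n_used]);
}

/* Mirror of the new loop: shrink until we fit, then remember the goal. */
static int change_mmu_pages(struct mmu *mmu, int goal)
{
        while (mmu->n_used > goal && mmu->n_used > 0)
                zap_oldest(mmu);
        if (mmu->n_used > goal)
                goal = mmu->n_used;
        return goal;            /* stored in n_max_mmu_pages in the real code */
}

int main(void)
{
        struct mmu mmu = { 3, { 10, 11, 12 } };

        printf("new limit: %d\n", change_mmu_pages(&mmu, 1));
        return 0;
}
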
@@ -1709,11 +1798,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1798 LIST_HEAD(invalid_list);
1710 int r; 1799 int r;
1711 1800
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1801 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1802 r = 0;
1714 1803
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1804 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1805 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1806 sp->role.word);
1718 r = 1; 1807 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1808 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1818,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1818 LIST_HEAD(invalid_list);
1730 1819
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1820 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1821 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1822 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1823 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1824 }
@@ -1915,9 +2004,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1915 unsigned pte_access, int user_fault, 2004 unsigned pte_access, int user_fault,
1916 int write_fault, int dirty, int level, 2005 int write_fault, int dirty, int level,
1917 gfn_t gfn, pfn_t pfn, bool speculative, 2006 gfn_t gfn, pfn_t pfn, bool speculative,
1918 bool can_unsync, bool reset_host_protection) 2007 bool can_unsync, bool host_writable)
1919{ 2008{
1920 u64 spte; 2009 u64 spte, entry = *sptep;
1921 int ret = 0; 2010 int ret = 0;
1922 2011
1923 /* 2012 /*
@@ -1925,7 +2014,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 2014 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 2015 * demand paging).
1927 */ 2016 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 2017 spte = PT_PRESENT_MASK;
1929 if (!speculative) 2018 if (!speculative)
1930 spte |= shadow_accessed_mask; 2019 spte |= shadow_accessed_mask;
1931 if (!dirty) 2020 if (!dirty)
@@ -1942,14 +2031,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1942 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2031 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1943 kvm_is_mmio_pfn(pfn)); 2032 kvm_is_mmio_pfn(pfn));
1944 2033
1945 if (reset_host_protection) 2034 if (host_writable)
1946 spte |= SPTE_HOST_WRITEABLE; 2035 spte |= SPTE_HOST_WRITEABLE;
2036 else
2037 pte_access &= ~ACC_WRITE_MASK;
1947 2038
1948 spte |= (u64)pfn << PAGE_SHIFT; 2039 spte |= (u64)pfn << PAGE_SHIFT;
1949 2040
1950 if ((pte_access & ACC_WRITE_MASK) 2041 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 2042 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 2043 && !is_write_protection(vcpu) && !user_fault)) {
1953 2044
1954 if (level > PT_PAGE_TABLE_LEVEL && 2045 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2046 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2051,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2051
1961 spte |= PT_WRITABLE_MASK; 2052 spte |= PT_WRITABLE_MASK;
1962 2053
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2054 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2056 spte &= ~PT_USER_MASK;
1965 2057
1966 /* 2058 /*
@@ -1973,7 +2065,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2065 goto set_pte;
1974 2066
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2067 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2068 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2069 __func__, gfn);
1978 ret = 1; 2070 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2071 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,9 +2078,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2078 mark_page_dirty(vcpu->kvm, gfn);
1987 2079
1988set_pte: 2080set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2081 update_spte(sptep, spte);
2082 /*
2083 * If we overwrite a writable spte with a read-only one we
2084 * should flush remote TLBs. Otherwise rmap_write_protect
2085 * will find a read-only spte, even though the writable spte
2086 * might be cached on a CPU's TLB.
2087 */
2088 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2089 kvm_flush_remote_tlbs(vcpu->kvm);
1992done: 2090done:
1993 return ret; 2091 return ret;
1994} 2092}
@@ -1998,13 +2096,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1998 int user_fault, int write_fault, int dirty, 2096 int user_fault, int write_fault, int dirty,
1999 int *ptwrite, int level, gfn_t gfn, 2097 int *ptwrite, int level, gfn_t gfn,
2000 pfn_t pfn, bool speculative, 2098 pfn_t pfn, bool speculative,
2001 bool reset_host_protection) 2099 bool host_writable)
2002{ 2100{
2003 int was_rmapped = 0; 2101 int was_rmapped = 0;
2004 int rmap_count; 2102 int rmap_count;
2005 2103
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2104 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2105 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2106 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2107 write_fault, user_fault, gfn);
2010 2108
@@ -2023,7 +2121,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2122 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2123 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2124 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2125 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2127 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2033,14 +2131,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2033 2131
2034 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2035 dirty, level, gfn, pfn, speculative, true, 2133 dirty, level, gfn, pfn, speculative, true,
2036 reset_host_protection)) { 2134 host_writable)) {
2037 if (write_fault) 2135 if (write_fault)
2038 *ptwrite = 1; 2136 *ptwrite = 1;
2039 kvm_mmu_flush_tlb(vcpu); 2137 kvm_mmu_flush_tlb(vcpu);
2040 } 2138 }
2041 2139
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2142 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2143 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2144 *sptep, sptep);
@@ -2064,8 +2162,95 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2162{
2065} 2163}
2066 2164
2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2166 bool no_dirty_log)
2167{
2168 struct kvm_memory_slot *slot;
2169 unsigned long hva;
2170
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) {
2173 get_page(bad_page);
2174 return page_to_pfn(bad_page);
2175 }
2176
2177 hva = gfn_to_hva_memslot(slot, gfn);
2178
2179 return hva_to_pfn_atomic(vcpu->kvm, hva);
2180}
2181
2182static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2183 struct kvm_mmu_page *sp,
2184 u64 *start, u64 *end)
2185{
2186 struct page *pages[PTE_PREFETCH_NUM];
2187 unsigned access = sp->role.access;
2188 int i, ret;
2189 gfn_t gfn;
2190
2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2193 return -1;
2194
2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2196 if (ret <= 0)
2197 return -1;
2198
2199 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL,
2202 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true);
2204
2205 return 0;
2206}
2207
2208static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2209 struct kvm_mmu_page *sp, u64 *sptep)
2210{
2211 u64 *spte, *start = NULL;
2212 int i;
2213
2214 WARN_ON(!sp->role.direct);
2215
2216 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2217 spte = sp->spt + i;
2218
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2221 if (!start)
2222 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2224 break;
2225 start = NULL;
2226 } else if (!start)
2227 start = spte;
2228 }
2229}
2230
2231static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2232{
2233 struct kvm_mmu_page *sp;
2234
2235 /*
2236 * Since it's no accessed bit on EPT, it's no way to
2237 * distinguish between actually accessed translations
2238 * and prefetched, so disable pte prefetch if EPT is
2239 * enabled.
2240 */
2241 if (!shadow_accessed_mask)
2242 return;
2243
2244 sp = page_header(__pa(sptep));
2245 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2246 return;
2247
2248 __direct_pte_prefetch(vcpu, sp, sptep);
2249}
2250
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2251static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2252 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2253 bool prefault)
2069{ 2254{
2070 struct kvm_shadow_walk_iterator iterator; 2255 struct kvm_shadow_walk_iterator iterator;
2071 struct kvm_mmu_page *sp; 2256 struct kvm_mmu_page *sp;
@@ -2074,9 +2259,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2074 2259
2075 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2076 if (iterator.level == level) { 2261 if (iterator.level == level) {
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2262 unsigned pte_access = ACC_ALL;
2263
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2078 0, write, 1, &pt_write, 2265 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2266 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2268 ++vcpu->stat.pf_fixed;
2081 break; 2269 break;
2082 } 2270 }
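
direct_pte_prefetch() speculatively fills up to PTE_PREFETCH_NUM neighbouring last-level entries around the faulting one, starting from an index aligned down to the prefetch window so repeated faults in the same region reuse the same window; it is skipped entirely when there is no hardware accessed bit (EPT), since prefetched and genuinely used translations could then not be told apart. The index arithmetic is easy to check in isolation with this standalone sketch (PTE_PREFETCH_NUM mirrors the patch, everything else is illustrative):

#include <stdio.h>

#define PTE_PREFETCH_NUM 8      /* must be a power of two for the mask trick */

/* Align a page-table index down to the start of its prefetch window,
 * exactly like "i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1)". */
static unsigned window_start(unsigned index)
{
        return index & ~(PTE_PREFETCH_NUM - 1);
}

int main(void)
{
        unsigned faulting[] = { 0, 5, 8, 13, 511 };

        for (unsigned i = 0; i < sizeof(faulting) / sizeof(faulting[0]); i++)
                printf("fault at index %3u -> prefetch entries %3u..%3u\n",
                       faulting[i], window_start(faulting[i]),
                       window_start(faulting[i]) + PTE_PREFETCH_NUM - 1);
        return 0;
}
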
@@ -2098,28 +2286,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2286 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2287 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2289 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask);
2102 } 2291 }
2103 } 2292 }
2104 return pt_write; 2293 return pt_write;
2105} 2294}
2106 2295
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2297{
2109 char buf[1]; 2298 siginfo_t info;
2110 void __user *hva;
2111 int r;
2112 2299
2113 /* Touch the page, so send SIGBUS */ 2300 info.si_signo = SIGBUS;
2114 hva = (void __user *)gfn_to_hva(kvm, gfn); 2301 info.si_errno = 0;
2115 r = copy_from_user(buf, hva, 1); 2302 info.si_code = BUS_MCEERR_AR;
2303 info.si_addr = (void __user *)address;
2304 info.si_addr_lsb = PAGE_SHIFT;
2305
2306 send_sig_info(SIGBUS, &info, tsk);
2116} 2307}
2117 2308
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2310{
2120 kvm_release_pfn_clean(pfn); 2311 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2312 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2314 return 0;
2124 } else if (is_fault_pfn(pfn)) 2315 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2316 return -EFAULT;
@@ -2127,27 +2318,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2127 return 1; 2318 return 1;
2128} 2319}
2129 2320
2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2322 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2323{
2324 pfn_t pfn = *pfnp;
2325 gfn_t gfn = *gfnp;
2326 int level = *levelp;
2327
2328 /*
2329 * Check if it's a transparent hugepage. If this would be an
2330 * hugetlbfs page, level wouldn't be set to
2331 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2332 * here.
2333 */
2334 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2335 level == PT_PAGE_TABLE_LEVEL &&
2336 PageTransCompound(pfn_to_page(pfn)) &&
2337 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2338 unsigned long mask;
2339 /*
2340 * mmu_notifier_retry was successful and we hold the
2341 * mmu_lock here, so the pmd can't become splitting
2342 * from under us, and in turn
2343 * __split_huge_page_refcount() can't run from under
2344 * us and we can safely transfer the refcount from
2345 * PG_tail to PG_head as we switch the pfn to tail to
2346 * head.
2347 */
2348 *levelp = level = PT_DIRECTORY_LEVEL;
2349 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2350 VM_BUG_ON((gfn & mask) != (pfn & mask));
2351 if (pfn & mask) {
2352 gfn &= ~mask;
2353 *gfnp = gfn;
2354 kvm_release_pfn_clean(pfn);
2355 pfn &= ~mask;
2356 if (!get_page_unless_zero(pfn_to_page(pfn)))
2357 BUG();
2358 *pfnp = pfn;
2359 }
2360 }
2361}
2362
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365
2366static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2367 bool prefault)
2131{ 2368{
2132 int r; 2369 int r;
2133 int level; 2370 int level;
2371 int force_pt_level;
2134 pfn_t pfn; 2372 pfn_t pfn;
2135 unsigned long mmu_seq; 2373 unsigned long mmu_seq;
2374 bool map_writable;
2136 2375
2137 level = mapping_level(vcpu, gfn); 2376 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2138 2377 if (likely(!force_pt_level)) {
2139 /* 2378 level = mapping_level(vcpu, gfn);
2140 * This path builds a PAE pagetable - so we can map 2mb pages at 2379 /*
2141 * maximum. Therefore check if the level is larger than that. 2380 * This path builds a PAE pagetable - so we can map
2142 */ 2381 * 2mb pages at maximum. Therefore check if the level
2143 if (level > PT_DIRECTORY_LEVEL) 2382 * is larger than that.
2144 level = PT_DIRECTORY_LEVEL; 2383 */
2384 if (level > PT_DIRECTORY_LEVEL)
2385 level = PT_DIRECTORY_LEVEL;
2145 2386
2146 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2387 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2388 } else
2389 level = PT_PAGE_TABLE_LEVEL;
2147 2390
2148 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2391 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2149 smp_rmb(); 2392 smp_rmb();
2150 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2393
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0;
2151 2396
2152 /* mmio */ 2397 /* mmio */
2153 if (is_error_pfn(pfn)) 2398 if (is_error_pfn(pfn))
@@ -2157,7 +2402,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2157 if (mmu_notifier_retry(vcpu, mmu_seq)) 2402 if (mmu_notifier_retry(vcpu, mmu_seq))
2158 goto out_unlock; 2403 goto out_unlock;
2159 kvm_mmu_free_some_pages(vcpu); 2404 kvm_mmu_free_some_pages(vcpu);
2160 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2405 if (likely(!force_pt_level))
2406 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2407 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2408 prefault);
2161 spin_unlock(&vcpu->kvm->mmu_lock); 2409 spin_unlock(&vcpu->kvm->mmu_lock);
2162 2410
2163 2411
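
transparent_hugepage_adjust() promotes a 4k mapping to a 2M one when the backing page turns out to be a transparent hugepage: it raises the level to PT_DIRECTORY_LEVEL and aligns gfn and pfn down by the same mask so the huge SPTE points at the head of the compound page (the patch also moves the page reference from the tail to the head page, which this sketch omits). The VM_BUG_ON invariant and the rounding can be verified standalone; the values and helper below are made up for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGES_PER_2M_PAGE 512   /* 2M / 4K, i.e. KVM_PAGES_PER_HPAGE(2) */

static void hugepage_adjust(uint64_t *gfn, uint64_t *pfn)
{
        uint64_t mask = PAGES_PER_2M_PAGE - 1;

        /* Guest and host offsets inside the 2M region must agree,
         * otherwise one huge mapping could not cover both. */
        assert((*gfn & mask) == (*pfn & mask));

        *gfn &= ~mask;  /* round both down to the 2M boundary */
        *pfn &= ~mask;
}

int main(void)
{
        uint64_t gfn = 0x12345, pfn = 0xabd45;  /* same offset 0x145 in 2M */

        hugepage_adjust(&gfn, &pfn);
        printf("gfn 0x%llx, pfn 0x%llx\n",
               (unsigned long long)gfn, (unsigned long long)pfn);
        return 0;
}
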
@@ -2179,7 +2427,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2427 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2428 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2429 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2430 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2431 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2432 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2433 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2434
2185 sp = page_header(root); 2435 sp = page_header(root);
@@ -2222,83 +2472,163 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2472 return ret;
2223} 2473}
2224 2474
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2475static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2476{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2477 struct kvm_mmu_page *sp;
2230 int direct = 0; 2478 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2479
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2480 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2481 spin_lock(&vcpu->kvm->mmu_lock);
2482 kvm_mmu_free_some_pages(vcpu);
2483 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2484 1, ACC_ALL, NULL);
2485 ++sp->root_count;
2486 spin_unlock(&vcpu->kvm->mmu_lock);
2487 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2488 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2489 for (i = 0; i < 4; ++i) {
2490 hpa_t root = vcpu->arch.mmu.pae_root[i];
2491
2492 ASSERT(!VALID_PAGE(root));
2493 spin_lock(&vcpu->kvm->mmu_lock);
2494 kvm_mmu_free_some_pages(vcpu);
2495 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2496 i << 30,
2497 PT32_ROOT_LEVEL, 1, ACC_ALL,
2498 NULL);
2499 root = __pa(sp->spt);
2500 ++sp->root_count;
2501 spin_unlock(&vcpu->kvm->mmu_lock);
2502 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2503 }
2504 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2505 } else
2506 BUG();
2507
2508 return 0;
2509}
2510
2511static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2512{
2513 struct kvm_mmu_page *sp;
2514 u64 pdptr, pm_mask;
2515 gfn_t root_gfn;
2516 int i;
2517
2518 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2519
2520 if (mmu_check_root(vcpu, root_gfn))
2521 return 1;
2522
2523 /*
2524 * Do we shadow a long mode page table? If so we need to
2525 * write-protect the guests page table root.
2526 */
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2529
2238 ASSERT(!VALID_PAGE(root)); 2530 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2531
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2532 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2533 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2534 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2535 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2536 root = __pa(sp->spt);
2251 ++sp->root_count; 2537 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2538 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2539 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2540 return 0;
2255 } 2541 }
2256 direct = !is_paging(vcpu); 2542
2543 /*
2544 * We shadow a 32 bit page table. This may be a legacy 2-level
2545 * or a PAE 3-level page table. In either case we need to be aware that
2546 * the shadow page table may be a PAE or a long mode page table.
2547 */
2548 pm_mask = PT_PRESENT_MASK;
2549 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2550 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2551
2257 for (i = 0; i < 4; ++i) { 2552 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2553 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2554
2260 ASSERT(!VALID_PAGE(root)); 2555 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2556 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2557 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2558 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2559 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2560 continue;
2266 } 2561 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2562 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2563 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2564 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2565 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2566 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2567 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2568 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2569 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2570 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2571 root = __pa(sp->spt);
2282 ++sp->root_count; 2572 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2573 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2574
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2575 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2576 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2577 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2578
2579 /*
2580 * If we shadow a 32 bit page table with a long mode page
2581 * table we enter this path.
2582 */
2583 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2584 if (vcpu->arch.mmu.lm_root == NULL) {
2585 /*
2586 * The additional page necessary for this is only
2587 * allocated on demand.
2588 */
2589
2590 u64 *lm_root;
2591
2592 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2593 if (lm_root == NULL)
2594 return 1;
2595
2596 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2597
2598 vcpu->arch.mmu.lm_root = lm_root;
2599 }
2600
2601 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2602 }
2603
2288 return 0; 2604 return 0;
2289} 2605}
2290 2606
2607static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2608{
2609 if (vcpu->arch.mmu.direct_map)
2610 return mmu_alloc_direct_roots(vcpu);
2611 else
2612 return mmu_alloc_shadow_roots(vcpu);
2613}
2614
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2615static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2616{
2293 int i; 2617 int i;
2294 struct kvm_mmu_page *sp; 2618 struct kvm_mmu_page *sp;
2295 2619
2620 if (vcpu->arch.mmu.direct_map)
2621 return;
2622
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2624 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2625
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2628 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2629 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2630 mmu_sync_children(vcpu, sp);
2631 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2302 return; 2632 return;
2303 } 2633 }
2304 for (i = 0; i < 4; ++i) { 2634 for (i = 0; i < 4; ++i) {
@@ -2310,6 +2640,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2640 mmu_sync_children(vcpu, sp);
2311 } 2641 }
2312 } 2642 }
2643 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2644}
2314 2645
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2646void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2320,15 +2651,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2320} 2651}
2321 2652
2322static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2653static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2323 u32 access, u32 *error) 2654 u32 access, struct x86_exception *exception)
2324{ 2655{
2325 if (error) 2656 if (exception)
2326 *error = 0; 2657 exception->error_code = 0;
2327 return vaddr; 2658 return vaddr;
2328} 2659}
2329 2660
2661static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2662 u32 access,
2663 struct x86_exception *exception)
2664{
2665 if (exception)
2666 exception->error_code = 0;
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668}
2669
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2671 u32 error_code, bool prefault)
2332{ 2672{
2333 gfn_t gfn; 2673 gfn_t gfn;
2334 int r; 2674 int r;
@@ -2344,17 +2684,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2344 gfn = gva >> PAGE_SHIFT; 2684 gfn = gva >> PAGE_SHIFT;
2345 2685
2346 return nonpaging_map(vcpu, gva & PAGE_MASK, 2686 return nonpaging_map(vcpu, gva & PAGE_MASK,
2347 error_code & PFERR_WRITE_MASK, gfn); 2687 error_code & PFERR_WRITE_MASK, gfn, prefault);
2688}
2689
2690static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2691{
2692 struct kvm_arch_async_pf arch;
2693
2694 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2695 arch.gfn = gfn;
2696 arch.direct_map = vcpu->arch.mmu.direct_map;
2697 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2698
2699 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2700}
2701
2702static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2703{
2704 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2705 kvm_event_needs_reinjection(vcpu)))
2706 return false;
2707
2708 return kvm_x86_ops->interrupt_allowed(vcpu);
2709}
2710
2711static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2712 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2713{
2714 bool async;
2715
2716 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2717
2718 if (!async)
2719 return false; /* *pfn has correct page already */
2720
2721 put_page(pfn_to_page(*pfn));
2722
2723 if (!prefault && can_do_async_pf(vcpu)) {
2724 trace_kvm_try_async_get_page(gva, gfn);
2725 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2726 trace_kvm_async_pf_doublefault(gva, gfn);
2727 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2728 return true;
2729 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2730 return true;
2731 }
2732
2733 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2734
2735 return false;
2348} 2736}
2349 2737
2350static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2738static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2351 u32 error_code) 2739 bool prefault)
2352{ 2740{
2353 pfn_t pfn; 2741 pfn_t pfn;
2354 int r; 2742 int r;
2355 int level; 2743 int level;
2744 int force_pt_level;
2356 gfn_t gfn = gpa >> PAGE_SHIFT; 2745 gfn_t gfn = gpa >> PAGE_SHIFT;
2357 unsigned long mmu_seq; 2746 unsigned long mmu_seq;
2747 int write = error_code & PFERR_WRITE_MASK;
2748 bool map_writable;
2358 2749
2359 ASSERT(vcpu); 2750 ASSERT(vcpu);
2360 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
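
try_async_pf() is the core of the new asynchronous page fault path: when the pfn cannot be resolved without sleeping, and the in-kernel irqchip is present, no event is pending reinjection and interrupts are allowed, the fault is turned into an async-PF request so the guest can run something else while the host pages the memory in; a second fault on a gfn that is already in flight halts the vcpu instead, and if async handling is not possible the code falls back to the blocking gfn_to_pfn_prot() lookup. A condensed decision-flow model follows; the enum and helper names are hypothetical, only the branching mirrors the hunk above.

#include <stdbool.h>
#include <stdio.h>

enum fault_result { PFN_READY, ASYNC_QUEUED, VCPU_HALTED };

/* Condensed model of try_async_pf()'s decision making. */
static enum fault_result handle_fault(bool pfn_available_now,
                                      bool prefault,
                                      bool can_do_async_pf,
                                      bool already_waiting_on_this_gfn)
{
        if (pfn_available_now)
                return PFN_READY;               /* fast path: nothing to wait for */

        if (!prefault && can_do_async_pf) {
                if (already_waiting_on_this_gfn)
                        return VCPU_HALTED;     /* double fault on same gfn */
                return ASYNC_QUEUED;            /* notify guest, retry later */
        }

        /* Fall back to the blocking lookup (gfn_to_pfn_prot in the patch). */
        return PFN_READY;
}

int main(void)
{
        printf("%d %d %d\n",
               handle_fault(true, false, true, false),   /* PFN_READY */
               handle_fault(false, false, true, false),  /* ASYNC_QUEUED */
               handle_fault(false, false, true, true));  /* VCPU_HALTED */
        return 0;
}
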
@@ -2363,21 +2754,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2363 if (r) 2754 if (r)
2364 return r; 2755 return r;
2365 2756
2366 level = mapping_level(vcpu, gfn); 2757 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2367 2758 if (likely(!force_pt_level)) {
2368 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2759 level = mapping_level(vcpu, gfn);
2760 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2761 } else
2762 level = PT_PAGE_TABLE_LEVEL;
2369 2763
2370 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2764 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2371 smp_rmb(); 2765 smp_rmb();
2372 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2766
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0;
2769
2770 /* mmio */
2373 if (is_error_pfn(pfn)) 2771 if (is_error_pfn(pfn))
2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2375 spin_lock(&vcpu->kvm->mmu_lock); 2773 spin_lock(&vcpu->kvm->mmu_lock);
2376 if (mmu_notifier_retry(vcpu, mmu_seq)) 2774 if (mmu_notifier_retry(vcpu, mmu_seq))
2377 goto out_unlock; 2775 goto out_unlock;
2378 kvm_mmu_free_some_pages(vcpu); 2776 kvm_mmu_free_some_pages(vcpu);
2379 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2777 if (likely(!force_pt_level))
2380 level, gfn, pfn); 2778 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2779 r = __direct_map(vcpu, gpa, write, map_writable,
2780 level, gfn, pfn, prefault);
2381 spin_unlock(&vcpu->kvm->mmu_lock); 2781 spin_unlock(&vcpu->kvm->mmu_lock);
2382 2782
2383 return r; 2783 return r;
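
When a large mapping is allowed, the hunk above aligns the faulting gfn down to the first page of the huge page with gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1). A small stand-alone sketch of that alignment, assuming the usual x86 4K/2M/1G levels (not KVM code):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gfn_t;

/*
 * Pages per huge page at a given paging level for x86 with 4K base pages:
 * level 1 = 4K (1 page), level 2 = 2M (512 pages), level 3 = 1G (512 * 512).
 */
static gfn_t pages_per_hpage(int level)
{
        return 1ULL << ((level - 1) * 9);
}

static gfn_t align_gfn_to_level(gfn_t gfn, int level)
{
        return gfn & ~(pages_per_hpage(level) - 1);
}

int main(void)
{
        gfn_t gfn = 0x12345;
        int level;

        for (level = 1; level <= 3; level++)
                printf("level %d: gfn %#llx -> base gfn %#llx\n", level,
                       (unsigned long long)gfn,
                       (unsigned long long)align_gfn_to_level(gfn, level));
        return 0;
}
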
@@ -2393,10 +2793,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2793 mmu_free_roots(vcpu);
2394} 2794}
2395 2795
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2796static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2797 struct kvm_mmu *context)
2397{ 2798{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2799 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2800 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2801 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2404,9 +2803,12 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2404 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2405 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2406 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2407 context->root_level = 0; 2807 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
2810 context->direct_map = true;
2811 context->nx = false;
2410 return 0; 2812 return 0;
2411} 2813}
2412 2814
@@ -2418,15 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2418 2820
2419static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2420{ 2822{
2421 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2422 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2423} 2825}
2424 2826
2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2828{
2829 return kvm_read_cr3(vcpu);
2830}
2831
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2426 u64 addr, 2833 struct x86_exception *fault)
2427 u32 err_code)
2428{ 2834{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2430} 2836}
2431 2837
2432static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2840,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2840 nonpaging_free(vcpu);
2435} 2841}
2436 2842
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2843static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2844{
2439 int bit7; 2845 int bit7;
2440 2846
2441 bit7 = (gpte >> 7) & 1; 2847 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2849}
2444 2850
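
is_rsvd_bits_set() above simply masks a guest PTE against a per-level reserved-bit mask that reset_rsvds_bits_mask() derives from MAXPHYADDR and the NX setting. A self-contained sketch of the same check, with a local rsvd_bits() helper analogous to the kernel's and an illustrative 36-bit MAXPHYADDR:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Build a mask with bits s..e set, analogous to the kernel's rsvd_bits(). */
static uint64_t rsvd_bits(int s, int e)
{
        return ((1ULL << (e - s + 1)) - 1) << s;
}

static bool pte_has_rsvd_bits(uint64_t gpte, uint64_t rsvd_mask)
{
        return (gpte & rsvd_mask) != 0;
}

int main(void)
{
        int maxphyaddr = 36;    /* illustrative guest MAXPHYADDR */
        /*
         * For a 64-bit leaf PTE: physical-address bits above MAXPHYADDR are
         * reserved, and bit 63 is reserved when NX is not enabled.
         */
        uint64_t mask = rsvd_bits(maxphyaddr, 51) | rsvd_bits(63, 63);

        printf("mask = %#llx\n", (unsigned long long)mask);
        printf("plain pte 0x1063    -> rsvd? %d\n", pte_has_rsvd_bits(0x1063, mask));
        printf("pte with bit 40 set -> rsvd? %d\n", pte_has_rsvd_bits(1ULL << 40, mask));
        printf("pte with bit 63 set -> rsvd? %d\n", pte_has_rsvd_bits(1ULL << 63, mask));
        return 0;
}
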
2445#define PTTYPE 64 2851#define PTTYPE 64
@@ -2450,13 +2856,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2856#include "paging_tmpl.h"
2451#undef PTTYPE 2857#undef PTTYPE
2452 2858
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2859static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2860 struct kvm_mmu *context,
2861 int level)
2454{ 2862{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2863 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2864 u64 exb_bit_rsvd = 0;
2458 2865
2459 if (!is_nx(vcpu)) 2866 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2867 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2868 switch (level) {
2462 case PT32_ROOT_LEVEL: 2869 case PT32_ROOT_LEVEL:
@@ -2511,9 +2918,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2918 }
2512} 2919}
2513 2920
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2921static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2922 struct kvm_mmu *context,
2923 int level)
2515{ 2924{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2925 context->nx = is_nx(vcpu);
2926
2927 reset_rsvds_bits_mask(vcpu, context, level);
2517 2928
2518 ASSERT(is_pae(vcpu)); 2929 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2930 context->new_cr3 = paging_new_cr3;
@@ -2522,24 +2933,28 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2522 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2523 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2524 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2525 context->free = paging_free; 2937 context->free = paging_free;
2526 context->root_level = level; 2938 context->root_level = level;
2527 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2940 context->root_hpa = INVALID_PAGE;
2941 context->direct_map = false;
2529 return 0; 2942 return 0;
2530} 2943}
2531 2944
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2945static int paging64_init_context(struct kvm_vcpu *vcpu,
2946 struct kvm_mmu *context)
2533{ 2947{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2948 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2949}
2537 2950
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2951static int paging32_init_context(struct kvm_vcpu *vcpu,
2952 struct kvm_mmu *context)
2539{ 2953{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2954 context->nx = false;
2955
2956 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2957
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2958 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2959 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2960 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2547,44 +2962,57 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2547 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2548 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2549 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2550 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
2969 context->direct_map = false;
2553 return 0; 2970 return 0;
2554} 2971}
2555 2972
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2973static int paging32E_init_context(struct kvm_vcpu *vcpu,
2974 struct kvm_mmu *context)
2557{ 2975{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2976 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2977}
2561 2978
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2979static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2980{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2981 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2982
2983 context->base_role.word = 0;
2566 context->new_cr3 = nonpaging_new_cr3; 2984 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2985 context->page_fault = tdp_page_fault;
2568 context->free = nonpaging_free; 2986 context->free = nonpaging_free;
2569 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2570 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2571 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2993 context->direct_map = true;
2994 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2995 context->get_cr3 = get_cr3;
2996 context->inject_page_fault = kvm_inject_page_fault;
2997 context->nx = is_nx(vcpu);
2574 2998
2575 if (!is_paging(vcpu)) { 2999 if (!is_paging(vcpu)) {
3000 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 3001 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 3002 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 3003 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 3004 context->nx = is_nx(vcpu);
3005 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 3006 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 3007 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 3008 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 3009 context->nx = is_nx(vcpu);
3010 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 3011 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 3012 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 3013 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 3014 context->nx = false;
3015 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 3016 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 3017 context->root_level = PT32_ROOT_LEVEL;
2590 } 3018 }
@@ -2592,33 +3020,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 3020 return 0;
2593} 3021}
2594 3022
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 3024{
2597 int r; 3025 int r;
2598
2599 ASSERT(vcpu); 3026 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 3028
2602 if (!is_paging(vcpu)) 3029 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 3030 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 3031 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 3032 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 3033 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 3034 r = paging32E_init_context(vcpu, context);
2608 else 3035 else
2609 r = paging32_init_context(vcpu); 3036 r = paging32_init_context(vcpu, context);
2610 3037
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 3040
2614 return r; 3041 return r;
2615} 3042}
3043EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2616 3044
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3045static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2618{ 3046{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 3047 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2620 3048
2621 if (tdp_enabled) 3049 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3050 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3051 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3052
3053 return r;
3054}
3055
3056static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3057{
3058 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3059
3060 g_context->get_cr3 = get_cr3;
3061 g_context->inject_page_fault = kvm_inject_page_fault;
3062
3063 /*
3064 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3065 * translation of l2_gpa to l1_gpa addresses is done using the
3066 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3067 * functions between mmu and nested_mmu are swapped.
3068 */
3069 if (!is_paging(vcpu)) {
3070 g_context->nx = false;
3071 g_context->root_level = 0;
3072 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3073 } else if (is_long_mode(vcpu)) {
3074 g_context->nx = is_nx(vcpu);
3075 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3076 g_context->root_level = PT64_ROOT_LEVEL;
3077 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3078 } else if (is_pae(vcpu)) {
3079 g_context->nx = is_nx(vcpu);
3080 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3081 g_context->root_level = PT32E_ROOT_LEVEL;
3082 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3083 } else {
3084 g_context->nx = false;
3085 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3086 g_context->root_level = PT32_ROOT_LEVEL;
3087 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3088 }
3089
3090 return 0;
3091}
3092
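
With the nested MMU above, a guest virtual address is resolved in two stages: nested_mmu.gva_to_gpa walks L2's page tables to produce an L2 gpa, and translate_gpa then maps that L2 gpa to an L1 gpa. A toy composition of two translation callbacks to illustrate that split (the struct and walkers below are made up, not KVM's):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t gva_t;
typedef uint64_t gpa_t;

/*
 * Toy two-stage translator: stage 1 turns an L2 gva into an L2 gpa (guest
 * page tables), stage 2 turns that L2 gpa into an L1 gpa (nested tables).
 * Both walkers below just add fixed offsets; they stand in for real walks.
 */
struct toy_mmu {
        gpa_t (*gva_to_gpa)(gva_t gva);         /* stage 1 */
        gpa_t (*translate_gpa)(gpa_t gpa);      /* stage 2 */
};

static gpa_t l2_table_walk(gva_t gva)
{
        return gva + 0x100000;
}

static gpa_t nested_table_walk(gpa_t gpa)
{
        return gpa + 0x40000000;
}

static gpa_t toy_translate(struct toy_mmu *mmu, gva_t gva)
{
        gpa_t l2_gpa = mmu->gva_to_gpa(gva);

        return mmu->translate_gpa(l2_gpa);
}

int main(void)
{
        struct toy_mmu mmu = {
                .gva_to_gpa = l2_table_walk,
                .translate_gpa = nested_table_walk,
        };

        printf("l2 gva 0x1234 -> l1 gpa %#llx\n",
               (unsigned long long)toy_translate(&mmu, 0x1234));
        return 0;
}
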
3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3094{
3095 if (mmu_is_nested(vcpu))
3096 return init_kvm_nested_mmu(vcpu);
3097 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 3098 return init_kvm_tdp_mmu(vcpu);
2623 else 3099 else
2624 return init_kvm_softmmu(vcpu); 3100 return init_kvm_softmmu(vcpu);
@@ -2653,7 +3129,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 3129 if (r)
2654 goto out; 3130 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 3131 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 3132 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 3133out:
2658 return r; 3134 return r;
2659} 3135}
@@ -2663,6 +3139,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 3139{
2664 mmu_free_roots(vcpu); 3140 mmu_free_roots(vcpu);
2665} 3141}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 3143
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 3145 struct kvm_mmu_page *sp,
@@ -2686,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2686} 3163}
2687 3164
2688static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2689 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp, u64 *spte,
2690 u64 *spte,
2691 const void *new) 3167 const void *new)
2692{ 3168{
2693 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
@@ -2695,14 +3171,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3171 return;
2696 } 3172 }
2697 3173
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
2702 if (!sp->role.cr4_pae) 3175 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
2703 paging32_update_pte(vcpu, sp, spte, new);
2704 else
2705 paging64_update_pte(vcpu, sp, spte, new);
2706} 3176}
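
The hunk above replaces the old runtime branch on sp->role.cr4_pae with a single indirect call through the per-context update_pte hook that each *_init_context() now installs. A minimal sketch of that dispatch pattern (types and names below are illustrative):

#include <stdint.h>
#include <stdio.h>

struct toy_ctx;

/*
 * Per-context hook, installed once at init time instead of branching on the
 * paging mode at every PTE update.
 */
typedef void (*update_pte_fn)(struct toy_ctx *ctx, uint64_t *spte, uint64_t gpte);

struct toy_ctx {
        update_pte_fn update_pte;
};

static void update_pte_32(struct toy_ctx *ctx, uint64_t *spte, uint64_t gpte)
{
        (void)ctx;
        *spte = gpte & 0xffffffffULL;   /* pretend 32-bit PTE handling */
}

static void update_pte_64(struct toy_ctx *ctx, uint64_t *spte, uint64_t gpte)
{
        (void)ctx;
        *spte = gpte;                   /* pretend 64-bit PTE handling */
}

int main(void)
{
        struct toy_ctx pae_ctx = { .update_pte = update_pte_64 };
        struct toy_ctx legacy_ctx = { .update_pte = update_pte_32 };
        uint64_t spte = 0;

        pae_ctx.update_pte(&pae_ctx, &spte, 0x123456789ULL);
        printf("64-bit hook: spte=%#llx\n", (unsigned long long)spte);
        legacy_ctx.update_pte(&legacy_ctx, &spte, 0x123456789ULL);
        printf("32-bit hook: spte=%#llx\n", (unsigned long long)spte);
        return 0;
}
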
2707 3177
2708static bool need_remote_flush(u64 old, u64 new) 3178static bool need_remote_flush(u64 old, u64 new)
@@ -2737,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2737 return !!(spte && (*spte & shadow_accessed_mask)); 3207 return !!(spte && (*spte & shadow_accessed_mask));
2738} 3208}
2739 3209
2740static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2741 u64 gpte)
2742{
2743 gfn_t gfn;
2744 pfn_t pfn;
2745
2746 if (!is_present_gpte(gpte))
2747 return;
2748 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2749
2750 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2751 smp_rmb();
2752 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2753
2754 if (is_error_pfn(pfn)) {
2755 kvm_release_pfn_clean(pfn);
2756 return;
2757 }
2758 vcpu->arch.update_pte.gfn = gfn;
2759 vcpu->arch.update_pte.pfn = pfn;
2760}
2761
2762static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3210static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2763{ 3211{
2764 u64 *spte = vcpu->arch.last_pte_updated; 3212 u64 *spte = vcpu->arch.last_pte_updated;
@@ -2780,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2780 struct kvm_mmu_page *sp; 3228 struct kvm_mmu_page *sp;
2781 struct hlist_node *node; 3229 struct hlist_node *node;
2782 LIST_HEAD(invalid_list); 3230 LIST_HEAD(invalid_list);
2783 u64 entry, gentry; 3231 u64 entry, gentry, *spte;
2784 u64 *spte; 3232 unsigned pte_size, page_offset, misaligned, quadrant, offset;
2785 unsigned offset = offset_in_page(gpa); 3233 int level, npte, invlpg_counter, r, flooded = 0;
2786 unsigned pte_size;
2787 unsigned page_offset;
2788 unsigned misaligned;
2789 unsigned quadrant;
2790 int level;
2791 int flooded = 0;
2792 int npte;
2793 int r;
2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page; 3234 bool remote_flush, local_flush, zap_page;
2796 3235
2797 zap_page = remote_flush = local_flush = false; 3236 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa);
2798 3238
2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3239 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2800 3240
@@ -2802,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2802 3242
2803 /* 3243 /*
2804 * Assume that the pte write on a page table of the same type 3244 * Assume that the pte write on a page table of the same type
2805 * as the current vcpu paging mode. This is nearly always true 3245 * as the current vcpu paging mode since we update the sptes only
2806 * (might be false while changing modes). Note it is verified later 3246 * when they have the same mode.
2807 * by update_pte().
2808 */ 3247 */
2809 if ((is_pae(vcpu) && bytes == 4) || !new) { 3248 if ((is_pae(vcpu) && bytes == 4) || !new) {
2810 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3249 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -2830,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2830 break; 3269 break;
2831 } 3270 }
2832 3271
2833 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2834 spin_lock(&vcpu->kvm->mmu_lock); 3272 spin_lock(&vcpu->kvm->mmu_lock);
2835 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3273 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2836 gentry = 0; 3274 gentry = 0;
2837 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3275 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3276 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3277 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3278 if (guest_initiated) {
3279 kvm_mmu_access_page(vcpu, gfn);
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3280 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3281 && !last_updated_pte_accessed(vcpu)) {
2844 ++vcpu->arch.last_pt_write_count; 3282 ++vcpu->arch.last_pt_write_count;
@@ -2910,12 +3348,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3348 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3349 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3350 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3351 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3352 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2917 vcpu->arch.update_pte.pfn = bad_pfn;
2918 }
2919} 3353}
2920 3354
2921int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3355int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -2923,7 +3357,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3357 gpa_t gpa;
2924 int r; 3358 int r;
2925 3359
2926 if (tdp_enabled) 3360 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3361 return 0;
2928 3362
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3363 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,29 +3371,27 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3371
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3372void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3373{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3374 LIST_HEAD(invalid_list);
2942 3375
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3376 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3377 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3378 struct kvm_mmu_page *sp;
2947 3379
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3381 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3384 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3385 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3386}
2956 3387
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3389 void *insn, int insn_len)
2958{ 3390{
2959 int r; 3391 int r;
2960 enum emulation_result er; 3392 enum emulation_result er;
2961 3393
2962 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3394 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
2963 if (r < 0) 3395 if (r < 0)
2964 goto out; 3396 goto out;
2965 3397
@@ -2972,7 +3404,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2972 if (r) 3404 if (r)
2973 goto out; 3405 goto out;
2974 3406
2975 er = emulate_instruction(vcpu, cr2, error_code, 0); 3407 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
2976 3408
2977 switch (er) { 3409 switch (er) {
2978 case EMULATE_DONE: 3410 case EMULATE_DONE:
@@ -3013,6 +3445,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3445static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3446{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3447 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3448 if (vcpu->arch.mmu.lm_root != NULL)
3449 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3450}
3017 3451
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3452static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3488,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3488 return init_kvm_mmu(vcpu);
3055} 3489}
3056 3490
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3491void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3492{
3068 struct kvm_mmu_page *sp; 3493 struct kvm_mmu_page *sp;
@@ -3075,10 +3500,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3075 continue; 3500 continue;
3076 3501
3077 pt = sp->spt; 3502 pt = sp->spt;
3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3503 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3504 if (!is_shadow_present_pte(pt[i]) ||
3505 !is_last_spte(pt[i], sp->role.level))
3506 continue;
3507
3508 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i],
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages;
3512 continue;
3513 }
3514
3079 /* avoid RMW */ 3515 /* avoid RMW */
3080 if (is_writable_pte(pt[i])) 3516 if (is_writable_pte(pt[i]))
3081 pt[i] &= ~PT_WRITABLE_MASK; 3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3518 }
3082 } 3519 }
3083 kvm_flush_remote_tlbs(kvm); 3520 kvm_flush_remote_tlbs(kvm);
3084} 3521}
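
The rewritten loop above skips non-present and non-leaf entries, drops large sptes instead of write-protecting them, and clears the writable bit through update_spte() rather than a bare read-modify-write. A stand-alone sketch of the same write-protect pass over a flat array of fake sptes (the bit layout below is simplified and illustrative):

#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT    (1ULL << 0)
#define SPTE_WRITABLE   (1ULL << 1)
#define SPTE_LARGE      (1ULL << 7)

/*
 * Toy write-protect pass: drop large mappings, strip the writable bit from
 * present leaf entries, leave everything else untouched.
 */
static void write_protect_slot(uint64_t *spt, int n, int *dropped)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!(spt[i] & SPTE_PRESENT))
                        continue;
                if (spt[i] & SPTE_LARGE) {
                        spt[i] = 0;     /* refault later as small pages */
                        (*dropped)++;
                        continue;
                }
                spt[i] &= ~SPTE_WRITABLE;
        }
}

int main(void)
{
        uint64_t spt[4] = {
                0,                                              /* not present */
                SPTE_PRESENT | SPTE_WRITABLE,                   /* small, writable */
                SPTE_PRESENT | SPTE_WRITABLE | SPTE_LARGE,      /* large, writable */
                SPTE_PRESENT,                                   /* already read-only */
        };
        int dropped = 0;
        int i;

        write_protect_slot(spt, 4, &dropped);
        for (i = 0; i < 4; i++)
                printf("spte[%d] = %#llx\n", i, (unsigned long long)spt[i]);
        printf("large sptes dropped: %d\n", dropped);
        return 0;
}
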
@@ -3108,27 +3545,27 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3545 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3109} 3546}
3110 3547
3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3548static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3112{ 3549{
3113 struct kvm *kvm; 3550 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3551 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3552 int nr_to_scan = sc->nr_to_scan;
3553
3554 if (nr_to_scan == 0)
3555 goto out;
3116 3556
3117 spin_lock(&kvm_lock); 3557 raw_spin_lock(&kvm_lock);
3118 3558
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3559 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3560 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3561 LIST_HEAD(invalid_list);
3122 3562
3123 idx = srcu_read_lock(&kvm->srcu); 3563 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3564 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3565 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3566 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3567 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3568 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3569 kvm_freed = kvm;
3133 } 3570 }
3134 nr_to_scan--; 3571 nr_to_scan--;
@@ -3140,9 +3577,10 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3140 if (kvm_freed) 3577 if (kvm_freed)
3141 list_move_tail(&kvm_freed->vm_list, &vm_list); 3578 list_move_tail(&kvm_freed->vm_list, &vm_list);
3142 3579
3143 spin_unlock(&kvm_lock); 3580 raw_spin_unlock(&kvm_lock);
3144 3581
3145 return cache_count; 3582out:
3583 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3584}
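
The reworked mmu_shrink() above follows the convention of the shrink_control API: with sc->nr_to_scan == 0 it only reports the global count read from the kvm_total_used_mmu_pages percpu counter, otherwise it zaps pages from at most one VM per call and rotates that VM to the tail of vm_list. A toy cache with the same report-or-scan split (the shrink_control struct below is a stand-in, not the kernel's):

#include <stdio.h>

struct toy_shrink_control {
        unsigned long nr_to_scan;       /* 0 means "just report the count" */
};

static unsigned long cache_objects = 100;

/*
 * Toy shrinker callback: free up to nr_to_scan objects, and always return
 * how many reclaimable objects remain so the caller can decide whether to
 * call again.
 */
static unsigned long toy_shrink(struct toy_shrink_control *sc)
{
        if (sc->nr_to_scan) {
                unsigned long n = sc->nr_to_scan < cache_objects ?
                                  sc->nr_to_scan : cache_objects;

                cache_objects -= n;
        }
        return cache_objects;
}

int main(void)
{
        struct toy_shrink_control report = { .nr_to_scan = 0 };
        struct toy_shrink_control scan = { .nr_to_scan = 32 };

        printf("reported count: %lu\n", toy_shrink(&report));
        printf("after scanning: %lu\n", toy_shrink(&scan));
        printf("reported count: %lu\n", toy_shrink(&report));
        return 0;
}
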
3147 3585
3148static struct shrinker mmu_shrinker = { 3586static struct shrinker mmu_shrinker = {
@@ -3160,12 +3598,6 @@ static void mmu_destroy_caches(void)
3160 kmem_cache_destroy(mmu_page_header_cache); 3598 kmem_cache_destroy(mmu_page_header_cache);
3161} 3599}
3162 3600
3163void kvm_mmu_module_exit(void)
3164{
3165 mmu_destroy_caches();
3166 unregister_shrinker(&mmu_shrinker);
3167}
3168
3169int kvm_mmu_module_init(void) 3601int kvm_mmu_module_init(void)
3170{ 3602{
3171 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3185,6 +3617,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3617 if (!mmu_page_header_cache)
3186 goto nomem; 3618 goto nomem;
3187 3619
3620 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3621 goto nomem;
3622
3188 register_shrinker(&mmu_shrinker); 3623 register_shrinker(&mmu_shrinker);
3189 3624
3190 return 0; 3625 return 0;
@@ -3259,7 +3694,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3259 3694
3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3695static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3261{ 3696{
3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3697 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3263 return 1; 3698 return 1;
3264} 3699}
3265 3700
@@ -3355,271 +3790,25 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3790}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3791EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3792
3358#ifdef AUDIT 3793void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3359
3360static const char *audit_msg;
3361
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438
3439 if (is_error_pfn(pfn)) {
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{ 3794{
3554 struct kvm_mmu_page *sp; 3795 ASSERT(vcpu);
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575 3796
3576static void audit_rmap(struct kvm_vcpu *vcpu) 3797 destroy_kvm_mmu(vcpu);
3577{ 3798 free_mmu_pages(vcpu);
3578 check_writable_mappings_rmap(vcpu); 3799 mmu_free_memory_caches(vcpu);
3579 count_rmaps(vcpu);
3580} 3800}
3581 3801
3582static void audit_write_protection(struct kvm_vcpu *vcpu) 3802#ifdef CONFIG_KVM_MMU_AUDIT
3583{ 3803#include "mmu_audit.c"
3584 struct kvm_mmu_page *sp; 3804#else
3585 struct kvm_memory_slot *slot; 3805static void mmu_audit_disable(void) { }
3586 unsigned long *rmapp; 3806#endif
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610 3807
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) 3808void kvm_mmu_module_exit(void)
3612{ 3809{
3613 int olddbg = dbg; 3810 mmu_destroy_caches();
3614 3811 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3615 dbg = 0; 3812 unregister_shrinker(&mmu_shrinker);
3616 audit_msg = msg; 3813 mmu_audit_disable();
3617 audit_rmap(vcpu);
3618 audit_write_protection(vcpu);
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3814}
3624
3625#endif