path: root/arch/x86/kvm/mmu.c
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--  arch/x86/kvm/mmu.c  918
1 file changed, 485 insertions(+), 433 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..908ea5464a51 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -49,15 +49,25 @@
49 */ 49 */
50bool tdp_enabled = false; 50bool tdp_enabled = false;
51 51
52#undef MMU_DEBUG 52enum {
53 AUDIT_PRE_PAGE_FAULT,
54 AUDIT_POST_PAGE_FAULT,
55 AUDIT_PRE_PTE_WRITE,
56 AUDIT_POST_PTE_WRITE,
57 AUDIT_PRE_SYNC,
58 AUDIT_POST_SYNC
59};
53 60
54#undef AUDIT 61char *audit_point_name[] = {
62 "pre page fault",
63 "post page fault",
64 "pre pte write",
65 "post pte write",
66 "pre sync",
67 "post sync"
68};
55 69
56#ifdef AUDIT 70#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 71
62#ifdef MMU_DEBUG 72#ifdef MMU_DEBUG
63 73
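
The hunk above replaces the old compile-time AUDIT hook with numbered audit points and a parallel name table; later hunks in this patch switch the kvm_mmu_audit(vcpu, "...") call sites over to trace_kvm_mmu_audit(vcpu, AUDIT_*). A minimal stand-alone C illustration of that enum/name pairing follows; the printf is only a stand-in for the tracepoint, not KVM code.

/* Illustrative sketch only: the enum/string-table pattern introduced above. */
#include <stdio.h>

enum {
	AUDIT_PRE_PAGE_FAULT,
	AUDIT_POST_PAGE_FAULT,
	AUDIT_PRE_PTE_WRITE,
	AUDIT_POST_PTE_WRITE,
	AUDIT_PRE_SYNC,
	AUDIT_POST_SYNC
};

static const char *audit_point_name[] = {
	"pre page fault", "post page fault",
	"pre pte write",  "post pte write",
	"pre sync",       "post sync"
};

int main(void)
{
	/* A tracepoint consumer can map the numeric point back to a name. */
	printf("audit point %d = \"%s\"\n",
	       AUDIT_PRE_PTE_WRITE, audit_point_name[AUDIT_PRE_PTE_WRITE]);
	return 0;
}
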
@@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 81
72#endif 82#endif
73 83
74#if defined(MMU_DEBUG) || defined(AUDIT) 84#ifdef MMU_DEBUG
75static int dbg = 0; 85static int dbg = 0;
76module_param(dbg, bool, 0644); 86module_param(dbg, bool, 0644);
77#endif 87#endif
@@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644);
89 } 99 }
90#endif 100#endif
91 101
102#define PTE_PREFETCH_NUM 8
103
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 104#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 105#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 106
@@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 190static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 191static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 192static struct kmem_cache *mmu_page_header_cache;
193static struct percpu_counter kvm_total_used_mmu_pages;
181 194
182static u64 __read_mostly shadow_trap_nonpresent_pte; 195static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 196static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 312#endif
300} 313}
301 314
315static bool spte_has_volatile_bits(u64 spte)
316{
317 if (!shadow_accessed_mask)
318 return false;
319
320 if (!is_shadow_present_pte(spte))
321 return false;
322
323 if ((spte & shadow_accessed_mask) &&
324 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
325 return false;
326
327 return true;
328}
329
330static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
331{
332 return (old_spte & bit_mask) && !(new_spte & bit_mask);
333}
334
302static void update_spte(u64 *sptep, u64 new_spte) 335static void update_spte(u64 *sptep, u64 new_spte)
303{ 336{
304 u64 old_spte; 337 u64 mask, old_spte = *sptep;
338
339 WARN_ON(!is_rmap_spte(new_spte));
340
341 new_spte |= old_spte & shadow_dirty_mask;
305 342
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 343 mask = shadow_accessed_mask;
307 !is_rmap_spte(*sptep)) 344 if (is_writable_pte(old_spte))
345 mask |= shadow_dirty_mask;
346
347 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
308 __set_spte(sptep, new_spte); 348 __set_spte(sptep, new_spte);
309 else { 349 else
310 old_spte = __xchg_spte(sptep, new_spte); 350 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 351
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 352 if (!shadow_accessed_mask)
313 } 353 return;
354
355 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
356 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
357 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
358 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 359}
315 360
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 361static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
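
The rewritten update_spte() above only pays for an atomic exchange when the old SPTE still has "volatile" accessed/dirty bits that hardware could set and the new value would clear them; otherwise a plain store suffices, and any bit that does get cleared is forwarded to kvm_set_pfn_accessed()/kvm_set_pfn_dirty(). Below is a small user-space model of that decision. The SPTE_* bit positions and the plain store in the slow path are simplifications, not the kernel's shadow_*_mask values or __xchg_spte().

/* User-space model of the update_spte() fast/slow-path choice (assumptions:
 * made-up bit positions, no concurrency, printf instead of the pfn helpers). */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT  (1ull << 0)
#define SPTE_WRITABLE (1ull << 1)
#define SPTE_ACCESSED (1ull << 5)
#define SPTE_DIRTY    (1ull << 6)

/* "Volatile" = hardware could still flip A or D behind the updater's back. */
static bool spte_has_volatile_bits(uint64_t spte)
{
	if (!(spte & SPTE_PRESENT))
		return false;
	if ((spte & SPTE_ACCESSED) &&
	    (!(spte & SPTE_WRITABLE) || (spte & SPTE_DIRTY)))
		return false;	/* nothing left for hardware to set */
	return true;
}

static bool bit_cleared(uint64_t old_spte, uint64_t new_spte, uint64_t bit)
{
	return (old_spte & bit) && !(new_spte & bit);
}

static void update_spte_model(uint64_t *sptep, uint64_t new_spte)
{
	uint64_t old_spte = *sptep, mask = SPTE_ACCESSED;

	new_spte |= old_spte & SPTE_DIRTY;	/* never lose a dirty bit */
	if (old_spte & SPTE_WRITABLE)
		mask |= SPTE_DIRTY;

	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) {
		*sptep = new_spte;	/* cheap, non-atomic path */
	} else {
		/* The kernel does old_spte = __xchg_spte(sptep, new_spte)
		 * here; a plain store suffices in this single-threaded model. */
		*sptep = new_spte;
	}

	if (bit_cleared(old_spte, new_spte, SPTE_ACCESSED))
		printf("-> would call kvm_set_pfn_accessed()\n");
	if (bit_cleared(old_spte, new_spte, SPTE_DIRTY))
		printf("-> would call kvm_set_pfn_dirty()\n");
}

int main(void)
{
	uint64_t spte = SPTE_PRESENT | SPTE_WRITABLE |
			SPTE_ACCESSED | SPTE_DIRTY;

	/* Request a bare present mapping: A is dropped, D is carried over. */
	update_spte_model(&spte, SPTE_PRESENT);
	return 0;
}
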
@@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 412 if (r)
368 goto out; 413 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 414 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 415 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 416 if (r)
372 goto out; 417 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 418 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 636 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 637 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 638 *rmapp = (unsigned long)desc | 1;
639 ++count;
594 } else { 640 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 641 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 642 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 649 desc = desc->more;
604 } 650 }
605 for (i = 0; desc->sptes[i]; ++i) 651 for (i = 0; desc->sptes[i]; ++i)
606 ; 652 ++count;
607 desc->sptes[i] = spte; 653 desc->sptes[i] = spte;
608 } 654 }
609 return count; 655 return count;
@@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 691 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 692 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 693 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 694 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 695 BUG();
650 } else if (!(*rmapp & 1)) { 696 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 697 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 698 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 699 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 700 BUG();
656 } 701 }
657 *rmapp = 0; 702 *rmapp = 0;
658 } else { 703 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 704 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 705 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 706 prev_desc = NULL;
662 while (desc) { 707 while (desc) {
@@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 715 prev_desc = desc;
671 desc = desc->more; 716 desc = desc->more;
672 } 717 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 718 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 719 BUG();
675 } 720 }
676} 721}
@@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte)
680 pfn_t pfn; 725 pfn_t pfn;
681 u64 old_spte = *sptep; 726 u64 old_spte = *sptep;
682 727
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 728 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 729 __set_spte(sptep, new_spte);
686 } else 730 else
687 old_spte = __xchg_spte(sptep, new_spte); 731 old_spte = __xchg_spte(sptep, new_spte);
688 732
689 if (!is_rmap_spte(old_spte)) 733 if (!is_rmap_spte(old_spte))
690 return; 734 return;
735
691 pfn = spte_to_pfn(old_spte); 736 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 737 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 738 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 739 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 740 kvm_set_pfn_dirty(pfn);
696} 741}
697 742
@@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 791 }
747 spte = rmap_next(kvm, rmapp, spte); 792 spte = rmap_next(kvm, rmapp, spte);
748 } 793 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 794
757 /* check for huge page mappings */ 795 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 796 for (i = PT_DIRECTORY_LEVEL;
@@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt)
947} 985}
948#endif 986#endif
949 987
988/*
989 * This value is the sum of all of the kvm instances's
990 * kvm->arch.n_used_mmu_pages values. We need a global,
991 * aggregate version in order to make the slab shrinker
992 * faster
993 */
994static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
995{
996 kvm->arch.n_used_mmu_pages += nr;
997 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
998}
999
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1000static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1001{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1002 ASSERT(is_empty_shadow_page(sp->spt));
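
kvm_mod_used_mmu_pages() keeps the per-VM n_used_mmu_pages count exact while folding the same delta into the global kvm_total_used_mmu_pages percpu counter, so the slab shrinker can report overall usage without taking kvm_lock and walking every VM. A toy model of that bookkeeping follows; a plain long stands in for the kernel's percpu_counter and struct vm for struct kvm.

/* Sketch under simplifying assumptions: exact per-instance count plus an
 * approximate global aggregate, which is all a shrinker needs. */
#include <stdio.h>

static long total_used_mmu_pages;		/* global aggregate */

struct vm {
	long n_used_mmu_pages;			/* per-instance count */
};

static void mod_used_mmu_pages(struct vm *vm, int nr)
{
	vm->n_used_mmu_pages += nr;
	total_used_mmu_pages += nr;
}

int main(void)
{
	struct vm a = { 0 }, b = { 0 };

	mod_used_mmu_pages(&a, +1);		/* page allocated */
	mod_used_mmu_pages(&b, +3);
	mod_used_mmu_pages(&a, -1);		/* page freed */
	printf("shrinker sees %ld used pages\n", total_used_mmu_pages);
	return 0;
}
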
@@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
956 if (!sp->role.direct) 1006 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1007 __free_page(virt_to_page(sp->gfns));
958 kmem_cache_free(mmu_page_header_cache, sp); 1008 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1009 kvm_mod_used_mmu_pages(kvm, -1);
960} 1010}
961 1011
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1012static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1029 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1030 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1031 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1032 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1033 return sp;
984} 1034}
985 1035
@@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1453 if (role.direct)
1404 role.cr4_pae = 0; 1454 role.cr4_pae = 0;
1405 role.access = access; 1455 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1456 if (!vcpu->arch.mmu.direct_map
1457 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1458 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1459 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1460 role.quadrant = quadrant;
@@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1509 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1510 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1511 iterator->level = vcpu->arch.mmu.shadow_root_level;
1512
1513 if (iterator->level == PT64_ROOT_LEVEL &&
1514 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1515 !vcpu->arch.mmu.direct_map)
1516 --iterator->level;
1517
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1518 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1519 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1520 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1722
1666/* 1723/*
1667 * Changing the number of mmu pages allocated to the vm 1724 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1725 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1669 */ 1726 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1727void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1728{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1729 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1730 /*
1679 * If we set the number of mmu pages to be smaller be than the 1731 * If we set the number of mmu pages to be smaller be than the
1680 * number of actived pages , we must to free some mmu pages before we 1732 * number of actived pages , we must to free some mmu pages before we
1681 * change the value 1733 * change the value
1682 */ 1734 */
1683 1735
1684 if (used_pages > kvm_nr_mmu_pages) { 1736 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1737 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1738 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1739 struct kvm_mmu_page *page;
1688 1740
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1741 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1742 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1743 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1744 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1745 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1746 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1747 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1748
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1749 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1750}
1704 1751
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1752static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
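
With usage tracked directly, kvm_mmu_change_mmu_pages() above reduces to: zap shadow pages from the tail of active_mmu_pages until n_used_mmu_pages drops to the requested ceiling, then record that ceiling in n_max_mmu_pages. A toy model of the loop follows; evict_one() is a placeholder for the kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page() pair.

/* Toy resize loop; only the counters are modeled, not the page lists. */
#include <stdio.h>

struct vm {
	unsigned int n_used_mmu_pages;
	unsigned int n_max_mmu_pages;
};

static void evict_one(struct vm *vm)
{
	vm->n_used_mmu_pages--;		/* zap the oldest shadow page */
}

static void change_mmu_pages(struct vm *vm, unsigned int goal)
{
	if (vm->n_used_mmu_pages > goal) {
		while (vm->n_used_mmu_pages > goal)
			evict_one(vm);
		goal = vm->n_used_mmu_pages;
	}
	vm->n_max_mmu_pages = goal;
}

int main(void)
{
	struct vm vm = { .n_used_mmu_pages = 40 };

	change_mmu_pages(&vm, 16);
	printf("used=%u max=%u\n", vm.n_used_mmu_pages, vm.n_max_mmu_pages);
	return 0;
}
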
@@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1756 LIST_HEAD(invalid_list);
1710 int r; 1757 int r;
1711 1758
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1759 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1760 r = 0;
1714 1761
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1762 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1763 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1764 sp->role.word);
1718 r = 1; 1765 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1766 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1776 LIST_HEAD(invalid_list);
1730 1777
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1778 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1779 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1780 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1781 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1782 }
@@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 1972 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 1973 * demand paging).
1927 */ 1974 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 1975 spte = shadow_base_present_pte;
1929 if (!speculative) 1976 if (!speculative)
1930 spte |= shadow_accessed_mask; 1977 spte |= shadow_accessed_mask;
1931 if (!dirty) 1978 if (!dirty)
@@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1948 spte |= (u64)pfn << PAGE_SHIFT; 1995 spte |= (u64)pfn << PAGE_SHIFT;
1949 1996
1950 if ((pte_access & ACC_WRITE_MASK) 1997 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1998 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 1999 && !is_write_protection(vcpu) && !user_fault)) {
1953 2000
1954 if (level > PT_PAGE_TABLE_LEVEL && 2001 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2002 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2007
1961 spte |= PT_WRITABLE_MASK; 2008 spte |= PT_WRITABLE_MASK;
1962 2009
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2010 if (!vcpu->arch.mmu.direct_map
2011 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2012 spte &= ~PT_USER_MASK;
1965 2013
1966 /* 2014 /*
@@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2021 goto set_pte;
1974 2022
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2023 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2024 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2025 __func__, gfn);
1978 ret = 1; 2026 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2027 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2034 mark_page_dirty(vcpu->kvm, gfn);
1987 2035
1988set_pte: 2036set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2037 update_spte(sptep, spte);
1992done: 2038done:
1993 return ret; 2039 return ret;
@@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2004 int rmap_count; 2050 int rmap_count;
2005 2051
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2052 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2053 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2054 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2055 write_fault, user_fault, gfn);
2010 2056
@@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2069 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2070 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2071 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2072 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2073 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2074 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2075 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2040 } 2086 }
2041 2087
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2088 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2089 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2090 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2091 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2092 *sptep, sptep);
@@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2110{
2065} 2111}
2066 2112
2113static struct kvm_memory_slot *
2114pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
2115{
2116 struct kvm_memory_slot *slot;
2117
2118 slot = gfn_to_memslot(vcpu->kvm, gfn);
2119 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
2120 (no_dirty_log && slot->dirty_bitmap))
2121 slot = NULL;
2122
2123 return slot;
2124}
2125
2126static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2127 bool no_dirty_log)
2128{
2129 struct kvm_memory_slot *slot;
2130 unsigned long hva;
2131
2132 slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
2133 if (!slot) {
2134 get_page(bad_page);
2135 return page_to_pfn(bad_page);
2136 }
2137
2138 hva = gfn_to_hva_memslot(slot, gfn);
2139
2140 return hva_to_pfn_atomic(vcpu->kvm, hva);
2141}
2142
2143static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2144 struct kvm_mmu_page *sp,
2145 u64 *start, u64 *end)
2146{
2147 struct page *pages[PTE_PREFETCH_NUM];
2148 unsigned access = sp->role.access;
2149 int i, ret;
2150 gfn_t gfn;
2151
2152 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2153 if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK))
2154 return -1;
2155
2156 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2157 if (ret <= 0)
2158 return -1;
2159
2160 for (i = 0; i < ret; i++, gfn++, start++)
2161 mmu_set_spte(vcpu, start, ACC_ALL,
2162 access, 0, 0, 1, NULL,
2163 sp->role.level, gfn,
2164 page_to_pfn(pages[i]), true, true);
2165
2166 return 0;
2167}
2168
2169static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2170 struct kvm_mmu_page *sp, u64 *sptep)
2171{
2172 u64 *spte, *start = NULL;
2173 int i;
2174
2175 WARN_ON(!sp->role.direct);
2176
2177 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2178 spte = sp->spt + i;
2179
2180 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2181 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2182 if (!start)
2183 continue;
2184 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2185 break;
2186 start = NULL;
2187 } else if (!start)
2188 start = spte;
2189 }
2190}
2191
2192static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2193{
2194 struct kvm_mmu_page *sp;
2195
2196 /*
2197 * Since it's no accessed bit on EPT, it's no way to
2198 * distinguish between actually accessed translations
2199 * and prefetched, so disable pte prefetch if EPT is
2200 * enabled.
2201 */
2202 if (!shadow_accessed_mask)
2203 return;
2204
2205 sp = page_header(__pa(sptep));
2206 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2207 return;
2208
2209 __direct_pte_prefetch(vcpu, sp, sptep);
2210}
2211
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2212static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2213 int level, gfn_t gfn, pfn_t pfn)
2069{ 2214{
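
__direct_pte_prefetch() above rounds the faulting index down to a PTE_PREFETCH_NUM-aligned window (the mask trick requires a power of two) and maps each contiguous run of still-nonpresent entries with one gfn_to_page_many_atomic() call. A user-space model of the window scan follows; NONPRESENT and prefetch_run() are stand-ins for shadow_trap_nonpresent_pte and direct_pte_prefetch_many().

/* Model of the prefetch-window scan; in the kernel the faulting SPTE has
 * already been installed, which the spte == sptep check accounts for. */
#include <stdint.h>
#include <stdio.h>

#define PTE_PREFETCH_NUM 8		/* must be a power of two for the mask */
#define NONPRESENT 0ull

static void prefetch_run(uint64_t *start, uint64_t *end)
{
	printf("prefetch %ld consecutive sptes\n", (long)(end - start));
}

static void prefetch_window(uint64_t *spt, int fault_idx)
{
	uint64_t *sptep = spt + fault_idx;
	uint64_t *spte, *start = NULL;
	int i;

	/* Round down to the start of the 8-entry window around the fault. */
	i = fault_idx & ~(PTE_PREFETCH_NUM - 1);
	spte = spt + i;

	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
		if (*spte != NONPRESENT || spte == sptep) {
			if (start)
				prefetch_run(start, spte);
			start = NULL;
		} else if (!start)
			start = spte;
	}
}

int main(void)
{
	uint64_t spt[512] = { 0 };

	spt[11] = 1;			/* already mapped: splits the window */
	prefetch_window(spt, 13);	/* fault on index 13 -> window 8..15 */
	return 0;
}
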
@@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2222 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
2078 0, write, 1, &pt_write, 2223 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2224 level, gfn, pfn, false, true);
2225 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2226 ++vcpu->stat.pf_fixed;
2081 break; 2227 break;
2082 } 2228 }
@@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2244 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2245 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2246 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2247 | shadow_user_mask | shadow_x_mask
2248 | shadow_accessed_mask);
2102 } 2249 }
2103 } 2250 }
2104 return pt_write; 2251 return pt_write;
2105} 2252}
2106 2253
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2254static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2255{
2109 char buf[1]; 2256 siginfo_t info;
2110 void __user *hva; 2257
2111 int r; 2258 info.si_signo = SIGBUS;
2259 info.si_errno = 0;
2260 info.si_code = BUS_MCEERR_AR;
2261 info.si_addr = (void __user *)address;
2262 info.si_addr_lsb = PAGE_SHIFT;
2112 2263
2113 /* Touch the page, so send SIGBUS */ 2264 send_sig_info(SIGBUS, &info, tsk);
2114 hva = (void __user *)gfn_to_hva(kvm, gfn);
2115 r = copy_from_user(buf, hva, 1);
2116} 2265}
2117 2266
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2267static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2268{
2120 kvm_release_pfn_clean(pfn); 2269 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2270 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2271 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2272 return 0;
2124 } else if (is_fault_pfn(pfn)) 2273 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2274 return -EFAULT;
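
kvm_send_hwpoison_signal() now delivers a precise SIGBUS with si_code BUS_MCEERR_AR and the faulting host virtual address, instead of touching the page to provoke one. For context, here is a hedged sketch of how a user-space VMM might consume that signal; this is not KVM or QEMU code, and it assumes a glibc that exposes BUS_MCEERR_AR.

/* Consumer-side sketch: catching the precise SIGBUS sent for a poisoned page.
 * fprintf from a handler is not async-signal-safe; it is used only for demo. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *info, void *ctx)
{
	(void)sig; (void)ctx;
	if (info->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "memory failure at hva %p\n", info->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = sigbus_handler,
				.sa_flags = SA_SIGINFO };

	sigaction(SIGBUS, &sa, NULL);
	/* ... run the guest; a poisoned guest page now raises SIGBUS here ... */
	return 0;
}
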
@@ -2179,7 +2328,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2328 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2329 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2330 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2331 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2332 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2333 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2334 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2335
2185 sp = page_header(root); 2336 sp = page_header(root);
@@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2373 return ret;
2223} 2374}
2224 2375
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2376static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2377{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2378 struct kvm_mmu_page *sp;
2230 int direct = 0; 2379 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2380
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2381 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2382 spin_lock(&vcpu->kvm->mmu_lock);
2383 kvm_mmu_free_some_pages(vcpu);
2384 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2385 1, ACC_ALL, NULL);
2386 ++sp->root_count;
2387 spin_unlock(&vcpu->kvm->mmu_lock);
2388 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2389 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2390 for (i = 0; i < 4; ++i) {
2391 hpa_t root = vcpu->arch.mmu.pae_root[i];
2392
2393 ASSERT(!VALID_PAGE(root));
2394 spin_lock(&vcpu->kvm->mmu_lock);
2395 kvm_mmu_free_some_pages(vcpu);
2396 sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
2397 PT32_ROOT_LEVEL, 1, ACC_ALL,
2398 NULL);
2399 root = __pa(sp->spt);
2400 ++sp->root_count;
2401 spin_unlock(&vcpu->kvm->mmu_lock);
2402 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2403 }
2404 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2405 } else
2406 BUG();
2407
2408 return 0;
2409}
2410
2411static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2412{
2413 struct kvm_mmu_page *sp;
2414 u64 pdptr, pm_mask;
2415 gfn_t root_gfn;
2416 int i;
2417
2418 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2419
2420 if (mmu_check_root(vcpu, root_gfn))
2421 return 1;
2422
2423 /*
2424 * Do we shadow a long mode page table? If so we need to
2425 * write-protect the guests page table root.
2426 */
2427 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2428 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2429
2238 ASSERT(!VALID_PAGE(root)); 2430 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2431
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2432 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2433 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2434 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2435 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2436 root = __pa(sp->spt);
2251 ++sp->root_count; 2437 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2438 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2439 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2440 return 0;
2255 } 2441 }
2256 direct = !is_paging(vcpu); 2442
2443 /*
2444 * We shadow a 32 bit page table. This may be a legacy 2-level
2445 * or a PAE 3-level page table. In either case we need to be aware that
2446 * the shadow page table may be a PAE or a long mode page table.
2447 */
2448 pm_mask = PT_PRESENT_MASK;
2449 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2450 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2451
2257 for (i = 0; i < 4; ++i) { 2452 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2453 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2454
2260 ASSERT(!VALID_PAGE(root)); 2455 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2456 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2457 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2458 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2459 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2460 continue;
2266 } 2461 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2462 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2463 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2464 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2465 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2466 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2467 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2468 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2469 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2470 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2471 root = __pa(sp->spt);
2282 ++sp->root_count; 2472 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2473 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2474
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2475 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2476 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2477 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2478
2479 /*
2480 * If we shadow a 32 bit page table with a long mode page
2481 * table we enter this path.
2482 */
2483 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2484 if (vcpu->arch.mmu.lm_root == NULL) {
2485 /*
2486 * The additional page necessary for this is only
2487 * allocated on demand.
2488 */
2489
2490 u64 *lm_root;
2491
2492 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2493 if (lm_root == NULL)
2494 return 1;
2495
2496 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2497
2498 vcpu->arch.mmu.lm_root = lm_root;
2499 }
2500
2501 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2502 }
2503
2288 return 0; 2504 return 0;
2289} 2505}
2290 2506
2507static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2508{
2509 if (vcpu->arch.mmu.direct_map)
2510 return mmu_alloc_direct_roots(vcpu);
2511 else
2512 return mmu_alloc_shadow_roots(vcpu);
2513}
2514
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2515static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2516{
2293 int i; 2517 int i;
2294 struct kvm_mmu_page *sp; 2518 struct kvm_mmu_page *sp;
2295 2519
2520 if (vcpu->arch.mmu.direct_map)
2521 return;
2522
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2523 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2524 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2525
2526 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2529 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2530 mmu_sync_children(vcpu, sp);
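
The new mmu_alloc_shadow_roots() above also covers the case where the guest runs a 2- or 3-level page table while the shadow table must be 4-level: an lm_root page is allocated on demand and its first entry points at the pae_root, so root_hpa always names a long-mode root for the hardware. A toy model of that pointer chain follows; the addresses and PM_* bits are illustrative only, not the kernel's pm_mask handling.

/* Toy model of the lm_root -> pae_root chaining; virtual addresses stand in
 * for physical ones purely for illustration. */
#include <stdint.h>
#include <stdio.h>

#define PM_PRESENT  (1ull << 0)
#define PM_WRITABLE (1ull << 1)
#define PM_USER     (1ull << 2)
#define PM_ACCESSED (1ull << 5)

int main(void)
{
	static uint64_t pae_root[4];	/* 3-level guest: four PDPTE slots */
	static uint64_t lm_root[512];	/* extra 4-level root, on demand */
	uint64_t pm_mask = PM_PRESENT | PM_ACCESSED | PM_WRITABLE | PM_USER;
	int i;

	/* Each PAE root entry points at a shadow page table (fake addresses). */
	for (i = 0; i < 4; i++)
		pae_root[i] = (0x1000ull * (i + 1)) | pm_mask;

	/* Entry 0 of the long-mode root points back at the PAE root. */
	lm_root[0] = (uint64_t)(uintptr_t)pae_root | pm_mask;

	printf("root_hpa -> lm_root[0] = %#llx\n",
	       (unsigned long long)lm_root[0]);
	return 0;
}
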
@@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2539 mmu_sync_children(vcpu, sp);
2311 } 2540 }
2312 } 2541 }
2542 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2543}
2314 2544
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2545void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2327 return vaddr; 2557 return vaddr;
2328} 2558}
2329 2559
2560static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2561 u32 access, u32 *error)
2562{
2563 if (error)
2564 *error = 0;
2565 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2566}
2567
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2568static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2569 u32 error_code)
2332{ 2570{
@@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2631 mmu_free_roots(vcpu);
2394} 2632}
2395 2633
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2634static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2635 struct kvm_mmu *context)
2397{ 2636{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2637 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2638 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2639 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2407 context->root_level = 0; 2644 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2645 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2646 context->root_hpa = INVALID_PAGE;
2647 context->direct_map = true;
2648 context->nx = false;
2410 return 0; 2649 return 0;
2411} 2650}
2412 2651
@@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu)
2422 mmu_free_roots(vcpu); 2661 mmu_free_roots(vcpu);
2423} 2662}
2424 2663
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2664static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2426 u64 addr, 2665{
2427 u32 err_code) 2666 return vcpu->arch.cr3;
2667}
2668
2669static void inject_page_fault(struct kvm_vcpu *vcpu)
2428{ 2670{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2671 vcpu->arch.mmu.inject_page_fault(vcpu);
2430} 2672}
2431 2673
2432static void paging_free(struct kvm_vcpu *vcpu) 2674static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2676 nonpaging_free(vcpu);
2435} 2677}
2436 2678
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2679static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2680{
2439 int bit7; 2681 int bit7;
2440 2682
2441 bit7 = (gpte >> 7) & 1; 2683 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2684 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2685}
2444 2686
2445#define PTTYPE 64 2687#define PTTYPE 64
@@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2692#include "paging_tmpl.h"
2451#undef PTTYPE 2693#undef PTTYPE
2452 2694
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2695static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2696 struct kvm_mmu *context,
2697 int level)
2454{ 2698{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2699 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2700 u64 exb_bit_rsvd = 0;
2458 2701
2459 if (!is_nx(vcpu)) 2702 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2703 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2704 switch (level) {
2462 case PT32_ROOT_LEVEL: 2705 case PT32_ROOT_LEVEL:
@@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2754 }
2512} 2755}
2513 2756
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2757static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2758 struct kvm_mmu *context,
2759 int level)
2515{ 2760{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2761 context->nx = is_nx(vcpu);
2762
2763 reset_rsvds_bits_mask(vcpu, context, level);
2517 2764
2518 ASSERT(is_pae(vcpu)); 2765 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2766 context->new_cr3 = paging_new_cr3;
@@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2526 context->root_level = level; 2773 context->root_level = level;
2527 context->shadow_root_level = level; 2774 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2775 context->root_hpa = INVALID_PAGE;
2776 context->direct_map = false;
2529 return 0; 2777 return 0;
2530} 2778}
2531 2779
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2780static int paging64_init_context(struct kvm_vcpu *vcpu,
2781 struct kvm_mmu *context)
2533{ 2782{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2783 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2784}
2537 2785
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2786static int paging32_init_context(struct kvm_vcpu *vcpu,
2787 struct kvm_mmu *context)
2539{ 2788{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2789 context->nx = false;
2790
2791 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2792
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2793 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2794 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2795 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2550 context->root_level = PT32_ROOT_LEVEL; 2800 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2801 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2802 context->root_hpa = INVALID_PAGE;
2803 context->direct_map = false;
2553 return 0; 2804 return 0;
2554} 2805}
2555 2806
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2807static int paging32E_init_context(struct kvm_vcpu *vcpu,
2808 struct kvm_mmu *context)
2557{ 2809{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2810 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2811}
2561 2812
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2813static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2814{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2815 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2816
2566 context->new_cr3 = nonpaging_new_cr3; 2817 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2818 context->page_fault = tdp_page_fault;
@@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2571 context->invlpg = nonpaging_invlpg; 2822 context->invlpg = nonpaging_invlpg;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2823 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2824 context->root_hpa = INVALID_PAGE;
2825 context->direct_map = true;
2826 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2827 context->get_cr3 = get_cr3;
2828 context->inject_page_fault = kvm_inject_page_fault;
2829 context->nx = is_nx(vcpu);
2574 2830
2575 if (!is_paging(vcpu)) { 2831 if (!is_paging(vcpu)) {
2832 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 2833 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 2834 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 2835 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2836 context->nx = is_nx(vcpu);
2837 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 2838 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 2839 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 2840 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2841 context->nx = is_nx(vcpu);
2842 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 2843 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 2844 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 2845 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 2846 context->nx = false;
2847 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 2848 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 2849 context->root_level = PT32_ROOT_LEVEL;
2590 } 2850 }
@@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 2852 return 0;
2593} 2853}
2594 2854
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 2855int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 2856{
2597 int r; 2857 int r;
2598
2599 ASSERT(vcpu); 2858 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2859 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 2860
2602 if (!is_paging(vcpu)) 2861 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 2862 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 2863 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 2864 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 2865 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 2866 r = paging32E_init_context(vcpu, context);
2608 else 2867 else
2609 r = paging32_init_context(vcpu); 2868 r = paging32_init_context(vcpu, context);
2610 2869
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 2870 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 2871 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 2872
2614 return r; 2873 return r;
2615} 2874}
2875EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2876
2877static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2878{
2879 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2880
2881 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
2882 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
2883 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
2884
2885 return r;
2886}
2887
2888static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
2889{
2890 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
2891
2892 g_context->get_cr3 = get_cr3;
2893 g_context->inject_page_fault = kvm_inject_page_fault;
2894
2895 /*
2896 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
2897 * translation of l2_gpa to l1_gpa addresses is done using the
2898 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
2899 * functions between mmu and nested_mmu are swapped.
2900 */
2901 if (!is_paging(vcpu)) {
2902 g_context->nx = false;
2903 g_context->root_level = 0;
2904 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
2905 } else if (is_long_mode(vcpu)) {
2906 g_context->nx = is_nx(vcpu);
2907 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
2908 g_context->root_level = PT64_ROOT_LEVEL;
2909 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2910 } else if (is_pae(vcpu)) {
2911 g_context->nx = is_nx(vcpu);
2912 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
2913 g_context->root_level = PT32E_ROOT_LEVEL;
2914 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
2915 } else {
2916 g_context->nx = false;
2917 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
2918 g_context->root_level = PT32_ROOT_LEVEL;
2919 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
2920 }
2921
2922 return 0;
2923}
2616 2924
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 2925static int init_kvm_mmu(struct kvm_vcpu *vcpu)
2618{ 2926{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 2927 vcpu->arch.update_pte.pfn = bad_pfn;
2620 2928
2621 if (tdp_enabled) 2929 if (mmu_is_nested(vcpu))
2930 return init_kvm_nested_mmu(vcpu);
2931 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 2932 return init_kvm_tdp_mmu(vcpu);
2623 else 2933 else
2624 return init_kvm_softmmu(vcpu); 2934 return init_kvm_softmmu(vcpu);
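
The comment in init_kvm_nested_mmu() above describes the swapped roles: arch.mmu.gva_to_gpa takes an L2 gva all the way to an L1 gpa, while arch.nested_mmu's gva_to_gpa (via translate_gpa) turns an L2 gpa into an L1 gpa. The toy two-stage lookup below makes the chaining concrete; flat arrays stand in for the real page-table walks and are purely illustrative.

/* Toy model of the two translation stages: guest (L2) paging, then the
 * nested stage that maps L2 "physical" to L1 "physical". */
#include <stdint.h>
#include <stdio.h>

#define NPAGES 16
#define PAGE_SHIFT 12

static uint64_t l2_page_table[NPAGES];	/* L2 gva frame -> L2 gpa frame */
static uint64_t nested_table[NPAGES];	/* L2 gpa frame -> L1 gpa frame */

static uint64_t l2_gva_to_l2_gpa(uint64_t gva)
{
	return l2_page_table[(gva >> PAGE_SHIFT) % NPAGES] | (gva & 0xfff);
}

static uint64_t l2_gpa_to_l1_gpa(uint64_t gpa)	/* nested translate_gpa role */
{
	return nested_table[(gpa >> PAGE_SHIFT) % NPAGES] | (gpa & 0xfff);
}

int main(void)
{
	uint64_t gva = (3ull << PAGE_SHIFT) | 0x42;

	l2_page_table[3] = 7ull << PAGE_SHIFT;
	nested_table[7] = 12ull << PAGE_SHIFT;

	/* Chaining both stages yields the L1 gpa for an L2 gva. */
	printf("l2_gva %#llx -> l1_gpa %#llx\n",
	       (unsigned long long)gva,
	       (unsigned long long)l2_gpa_to_l1_gpa(l2_gva_to_l2_gpa(gva)));
	return 0;
}
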
@@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 2963 if (r)
2654 goto out; 2964 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 2965 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2966 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 2967out:
2658 return r; 2968 return r;
2659} 2969}
@@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 2973{
2664 mmu_free_roots(vcpu); 2974 mmu_free_roots(vcpu);
2665} 2975}
2976EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 2977
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 2978static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 2979 struct kvm_mmu_page *sp,
@@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3006 return;
2696 } 3007 }
2697 3008
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 3009 if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return; 3010 return;
2700 3011
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3012 ++vcpu->kvm->stat.mmu_pte_updated;
@@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2837 kvm_mmu_access_page(vcpu, gfn); 3148 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3149 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3150 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3151 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3152 if (guest_initiated) {
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3153 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3154 && !last_updated_pte_accessed(vcpu)) {
@@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3221 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3222 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3223 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3224 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3225 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { 3226 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); 3227 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3234 gpa_t gpa;
2924 int r; 3235 int r;
2925 3236
2926 if (tdp_enabled) 3237 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3238 return 0;
2928 3239
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3240 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3248
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3249void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3250{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3251 LIST_HEAD(invalid_list);
2942 3252
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3253 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3254 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3255 struct kvm_mmu_page *sp;
2947 3256
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3257 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3258 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3259 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3260 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3261 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3262 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3263}
2956 3264
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3265int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3321static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3322{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3323 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3324 if (vcpu->arch.mmu.lm_root != NULL)
3325 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3326}
3017 3327
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3328static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3364 return init_kvm_mmu(vcpu);
3055} 3365}
3056 3366
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3367void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3368{
3068 struct kvm_mmu_page *sp; 3369 struct kvm_mmu_page *sp;
@@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3112{ 3413{
3113 struct kvm *kvm; 3414 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3415 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3416
3417 if (nr_to_scan == 0)
3418 goto out;
3116 3419
3117 spin_lock(&kvm_lock); 3420 spin_lock(&kvm_lock);
3118 3421
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3422 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3423 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3424 LIST_HEAD(invalid_list);
3122 3425
3123 idx = srcu_read_lock(&kvm->srcu); 3426 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3427 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3428 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3429 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3430 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3431 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3432 kvm_freed = kvm;
3133 } 3433 }
3134 nr_to_scan--; 3434 nr_to_scan--;
@@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3142 3442
3143 spin_unlock(&kvm_lock); 3443 spin_unlock(&kvm_lock);
3144 3444
3145 return cache_count; 3445out:
3446 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3447}
3147 3448
3148static struct shrinker mmu_shrinker = { 3449static struct shrinker mmu_shrinker = {
@@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void)
3163void kvm_mmu_module_exit(void) 3464void kvm_mmu_module_exit(void)
3164{ 3465{
3165 mmu_destroy_caches(); 3466 mmu_destroy_caches();
3467 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3166 unregister_shrinker(&mmu_shrinker); 3468 unregister_shrinker(&mmu_shrinker);
3167} 3469}
3168 3470
@@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3487 if (!mmu_page_header_cache)
3186 goto nomem; 3488 goto nomem;
3187 3489
3490 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3491 goto nomem;
3492
3188 register_shrinker(&mmu_shrinker); 3493 register_shrinker(&mmu_shrinker);
3189 3494
3190 return 0; 3495 return 0;
@@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3660}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3661EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3662
3358#ifdef AUDIT 3663#ifdef CONFIG_KVM_MMU_AUDIT
3359 3664#include "mmu_audit.c"
3360static const char *audit_msg; 3665#else
3361 3666static void mmu_audit_disable(void) { }
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif 3667#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438 3668
3439 if (is_error_pfn(pfn)) { 3669void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{
3554 struct kvm_mmu_page *sp;
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575
3576static void audit_rmap(struct kvm_vcpu *vcpu)
3577{
3578 check_writable_mappings_rmap(vcpu);
3579 count_rmaps(vcpu);
3580}
3581
3582static void audit_write_protection(struct kvm_vcpu *vcpu)
3583{
3584 struct kvm_mmu_page *sp;
3585 struct kvm_memory_slot *slot;
3586 unsigned long *rmapp;
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
3612{ 3670{
3613 int olddbg = dbg; 3671 ASSERT(vcpu);
3614 3672
3615 dbg = 0; 3673 destroy_kvm_mmu(vcpu);
3616 audit_msg = msg; 3674 free_mmu_pages(vcpu);
3617 audit_rmap(vcpu); 3675 mmu_free_memory_caches(vcpu);
3618 audit_write_protection(vcpu); 3676 mmu_audit_disable();
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3677}
3624
3625#endif